diff --git a/docs/regular-expressions.md b/docs/regular-expressions.md new file mode 100644 index 0000000000..484f34d1cc --- /dev/null +++ b/docs/regular-expressions.md @@ -0,0 +1,322 @@ +--- +title: Regular expressions in Microsoft Power Fx | Microsoft Docs +description: Reference information about working with regular expressions in Microsoft Power Fx +author: gregli-msft +ms.topic: conceptual +ms.reviewer: jdaly +ms.date: 1/31/2025 +ms.subservice: power-fx +ms.author: gregli +search.audienceType: + - maker +contributors: + - gregli-msft + - mduelae + - gregli +--- +# Regular expressions + +The [**IsMatch**, **Match**, and **MatchAll** functions](reference/function-ismatch.md) are used to extract and validate patterns in text. The pattern they use is called a [regular expression](https://en.wikipedia.org/wiki/Regular_expression). + +Regular expressions have a long history, are very powerful, are available in many programming languages, and are used for a wide variety of purposes. They also often look like a random sequence of punctuation marks. This article doesn't describe all aspects of regular expressions, but a wealth of information, tutorials, and tools are available online. + +Every programming language has its own dialect of regular expressions and there are few standards. As much as possible, we would like the same regular expression to give the same result across all Power Fx implementations. That isn't easy to accomplish as Power Fx runs on top of JavaScript and .NET which have significant differences. To accommodate running on different platforms, Power Fx regular expressions are limited to a subset of features that are widely supported across the industry. + +Power Fx will produce an authoring time error when unsupported features are encountered. This is one of the reasons that the regular expression and options must be an authoring time constant and not dynamic (for example, provided in a variable).
+ +## Supported features + +Power Fx supports the following regular expression features, with notes on how Power Fx behavior may differ from other systems. + +The regular expression must be a constant and not calculated or stored in a variable. Using the `&` operator and the `Concatenate`, `Char`, and `UniChar` functions with constant arguments is supported. + +### Literal characters + +| Feature | Description | +|---------|---------| +| Literal characters | Any Unicode character can be inserted directly, except `\`, `[`, `]`, `^`, `$`, `.`, `|`, `?`, `*`, `+`, `(`, `)`, `{`, and `}`. When using **MatchOptions.FreeSpacing**, `#`, ` `, and other `\s` space characters must be escaped as they have a different meaning. | +| Escaped literal characters | `\` (backslash) followed by one of the direct literal characters, such as `\?` to insert a question mark. `\#` and `\ ` may also be used even when **MatchOptions.FreeSpacing** is disabled for consistency. | +| Control characters | `\cA`, where the control character is `A` through `Z`, upper or lowercase. | +| Hexadecimal and Unicode character codes | `\x20` with two hexadecimal digits, `\u2028` with four hexadecimal digits. | +| Carriage return | `\r`, the same as `Char(13)`. | +| Newline character | `\n`, the same as `Char(10)`. | +| Form feed | `\f`, the same as `Char(12)`. | +| Horizontal Tab | `\t`, the same as `Char(9)`. | + +Octal codes for characters, such as `\044` or `\o{044}` are disallowed, as they can be ambiguous with numbered back references. Use `\x` or `\u` instead. + +`\v` is not supported as it is ambiguous across regular expression languages. Use `\x0b` for a vertical tab or `[\x0b\f\r\n\u2028\u2029]` for vertical whitespace. + +### Assertions + +Assertions match a particular position in the text, but do not consume any characters. + +| Feature | Description | +|---------|---------| +| Start of line | `^`, matches the beginning of the text, or of a line if **MatchOptions.Multiline** is used.
| +| End of line | `$`, matches the end of the text, or of a line if **MatchOptions.Multiline** is used. | +| Lookahead | `(?=a)` and `(?!a)`, matches ahead for a pattern. +| Lookbehind | `(?<=b)` and `(?chars)` captures a sub-match with the name `name`, referenced with `\k`. Cannot be used if **MatchOptions.NumberedSubMatches** is enabled. | +| Numbered group and back reference | When **MatchOptions.NumberedSubMatches** is enabled, `(a)` captures a sub-match referenced with `\1`. | +| Non-capture group | `(?:a)`, creates group without capturing the result as a named or numbered sub-match. All groups are non-capturing unless **MatchOptions.NumberedSubMatches** is enabled. | + +Named and numbered sub-matches cannot be used together. By default, named sub-matches are enabled and are preferred for clarity and maintainability, while standard capture groups become non capture groups with improved performance. This can be changed with **MatchOptions.NumberedSubMatches** which provides for traditional capture groups but disables named capture groups. Some implementations treat a mix of numbered and named capture groups differently which is why Power Fx disallows it. + +Self referencing capture groups are not supported, for example the regular expression `(a\1)`. + +Two capture groups cannot share the same name, for example the regular expression `(?\w+)|(?\d+)` is not supported. + +Some implementations offer an "explicit capture" option to improve performance which is unnecessary in Power Fx as it is effectively the default. **MatchOptions.NumberedSubMatches** disables it and enables implicit numbered captures. + +In some situations, in particular with 0 match allowed quantifiers, **SubMatches** can return different results between two different Power Fx implementations. See [Differences between implementations](#differences-between-implementations) for more information.
+ +### Comments + +| Feature | Description | +|---------|---------| +| Inline comments | `(?# comment here)`, which is ignored as a comment. Comment ends with the next close parenthesis, even if an opening parenthesis is in the comment.| + +See **MatchOptions.FreeSpacing** for an alternative for formatting and commenting regular expressions. + +### Inline options + +| Feature | Description | +|---------|---------| +| Inline options | `(?im)` is the same as using **MatchOptions.IgnoreCase** and **MatchOptions.Multiline**. Must be set at the beginning of the regular expression. | + +Supported inline modes are `[imsx]`, corresponding to **MatchOptions.IgnoreCase**, **MatchOptions.Multiline**, **MatchOptions.DotAll**, and **MatchOptions.FreeSpacing**, respectively. `n` is also accepted for compatibility but has no effect as it is the default and is incompatible with **MatchOptions.NumberedSubMatches**. + +Inline options cannot be used to disable an option or set an option for a sub-expression. + +## Options + +Match options change the behavior of regular expression matching. There are two ways to enable options, which can be mixed so long as there is no conflict: +- **MatchOptions** enum value passed as the third argument to **Match**, **MatchAll**, and **IsMatch**. Options can be combined with the `&` operator or `Concatenate` function, for example `MatchOptions.DotAll & MatchOptions.FreeSpacing`. All of the regular expression functions require that **MatchOptions** be a constant value; it cannot be calculated or stored in a variable. +- `(?...)` prefix at the very beginning of the regular expression. Options can be combined with multiple letters in the `(?...)` construct, for example `(?sx)`. Some options do not have a `(?...)` equivalent but may have other ways to get the same effect, for example **MatchOptions.BeginsWith** is the equivalent of `^` at the beginning of the regular expression.
+ +### Contains + +Enabled with **MatchOptions.Contains** without a regular expression text equivalent. This is the default. + +### Complete + +Enabled with **MatchOptions.Complete** or use `^` and `$` at the beginning and end of the regular expression, respectively. + +### BeginsWith + +Enabled with **MatchOptions.BeginsWith** or use `^` at the beginning of the regular expression. + +### EndsWith + +Enabled with **MatchOptions.EndsWith** or use `$` at the end of the regular expression. + +### DotAll + +Enabled with **MatchOptions.DotAll** or `(?s)` at the start of the regular expression. + +Normally the dot `.` operator will match all characters except newline characters `Char(10)` and `Char(13)`. With the **DotAll** modifier, all characters are matched, including newlines. + +In this example, only the "Hello" is matched as the newline after it will not be matched by a `.` by default: + +```powerapps-dot +Trim( Match( "Hello + World", ".*" ).FullMatch ) +// returns +// "Hello" +``` + +But if we add the **DotAll** modifier, then the newline and all subsequent characters will be matched: + +```powerapps-dot +Trim( Match( "Hello + World", ".*", MatchOptions.DotAll ).FullMatch ) +// returns +// "Hello +// World" +``` + +### FreeSpacing + +Enabled with **MatchOptions.FreeSpacing** or `(?x)` at the start of a regular expression. + +Free spacing makes it easier to read and maintain a complex regular expression. The rules are simple: +- Space characters are ignored in the regular expression, including tabs and newline characters. If matching a space is desired, use `\s`, `\ `, `\t`, `\r`, or `\n`. +- `#` begins a comment which runs until the end of the line. It and all characters that follow up to the next newline character are ignored. +- Character classes are not included in these changes. Space characters and `#` act as they normally do. For example, `IsMatch( "a#b c", "(?x)a[ #]b[ #]c" )` returns *true*.
Some regular expression languages include character classes in free spacing, or provide an option to include them, but Power Fx does not at this time. + +For example, here is a complex regular expression for matching an ISO [8601 date time](https://en.wikipedia.org/wiki/ISO_8601): + +```powerapps-dot +IsMatch( + "2025-01-17T19:38:49+0000", + "^\d{4}-(0\d|1[012])-([012]\d|3[01])(T([01]\d|2[0123]):[0-5]\d(:[0-5]\d(\.\d{3})?)?(Z|[\-+]\d{4}))?$" +) +// returns true +``` + +And here is the identical regular expression with free spacing utilizing multiple lines, indentation for groups, and regular expression comments, making this version much easier to understand, validate, and maintain. + +```powerapps-dot +IsMatch( "2025-01-17T19:38:49+0000", + "(?x) # enables free spacing, must be very first + ^ # matches from beginning of text + \d{4} # year (0000-9999) + -(0\d|1[012]) # month (00-12) + -([012]\d|3[01]) # day (00-31, range not checked against month) + (T([01]\d|2[0123]) # optional time, starting with hours (00-23) + :[0-5]\d # minutes (00-59) + (:[0-5]\d # optional seconds (00-59) + (\.\d{3})? # optional milliseconds (000-999) + )? + (Z|[\-+]\d{4}) # time zone + )? + $ # matches to end of text + " +) +// returns true +``` + +### IgnoreCase + +Enabled with **MatchOptions.IgnoreCase** or `(?i)` at the start of a regular expression. + +Matches text in a letter case insensitive manner: upper case letters match lower case letters and lower case letters match upper case letters. + +For example: + +```powerapps-dot +IsMatch( "HELLO!", "hello", MatchOptions.IgnoreCase ) +// returns true + +IsMatch( "file://c:/temp/info.txt", "^FILE://", MatchOptions.IgnoreCase ) +// returns true +``` + +Most parts of Power Fx are culture aware, but not here. Using culture invariant matching is the industry standard for regular expressions, including in JavaScript and Perl.
It is particularly useful in the second example where a system resource is being matched, in for example the `tr-TR` culture where `I` is not the uppercase equivalent of `i`. + +If a culture aware, case insensitive match is needed, use a character class with the matching characters instead, for example `[Hh][Ee][Ll][Ll][Oo]` for the first example. + +### Multiline + +Enabled with **MatchOptions.Multiline** or `(?m)` at the start of a regular expression. + +Normally, `^` and `$` anchors match the beginning and end of the input text. With the **Multiline** modifier, these anchors will match the beginning and end of lines in the input text, where each line ends with `\r`, `\n`, `\r\n`, or the end of the input. For example: + +```powerapps-dot +MatchAll( "Hello" & Char(13) & Char(10) & "World", "^.+$" ) +// returns +// "Hello" +``` + +### NumberedSubMatches + +Enabled with **MatchOptions.NumberedSubMatches** with no inline option. `(?n)` is supported as the opposite of this option for compatibility and is the default. + +By default, `(...)` does not capture, the equivalent of what most systems call "explicit capture". To capture, use a named capture with `(?<name>...)` with backreference `\k<name>`. This improves performance of the regular expression by not capturing groups that do not need to be captured and improving clarity by using names instead of numbers that can change. + +If you have an existing regular expression, it may depend on groups being captured automatically and numbered, including numbered back references. This is available by using the **MatchOptions.NumberedSubMatches** option. + +Named and numbered sub-matches cannot be used together. Some implementations treat a mix of numbered and named capture groups differently which is why Power Fx disallows it.
+ +## Differences between implementations + +As stated in the introduction, Power Fx's regular expressions are intentionally limited to features that can be consistently implemented on top of .NET, JavaScript, and other programming language regular expression engines. Authoring time errors prevent use of features that are not a part of this set. + +Despite this, although a feature may be supported across all implementations of Power Fx, there may be small semantic differences in how each of the implementations behaves. + +In general, **FullMatch** will be consistent across all implementations. Differences emerge with **SubMatches**, either named or numbered, in particular when used with quantifiers that include a zero match possibility (for example, `*`, `?`, and `{0,5}`) that could be satisfied in multiple ways. If a backreference is used to one of these, then the **FullMatch** may also be different. + +To avoid these differences in your formulas: +- Avoid quantifiers outside of a **SubMatch**. +- Test your regular expressions thoroughly, especially those involving **SubMatches** and backreferences. +- Be particularly careful if your Power Fx regular expression is used in a module across products. + +Let's look at some examples: + +```powerapps-dot +Match( "ab", "(a*)+(b)" , MatchOptions.NumberedSubMatches ) +// returns with .NET: {FullMatch:"ab", StartMatch:1, SubMatches:["", "b"]} +// returns with JavaScript: {FullMatch:"ab", StartMatch:1, SubMatches:["a", "b"]} +``` + +On a Power Fx implementation based on .NET, **Index( SubMatches, 1 )** will return an empty string `""`, while using JavaScript returns `"a"`. This difference is caused by how the different engines treat the `(a*)+`. In JavaScript it is satisfied by a single `a` as one iteration of the `+`, but on .NET it is satisfied by two iterations of the `+` with `a` followed by an empty string. The **FullMatch** is the same for both implementations as `"ab"` but we get there through two different paths.
+ +Here's an example with a backreference: + +```powerapps-dot +Match( "ab", "(a)*b\1" , MatchOptions.NumberedSubMatches ) +// returns with .NET: Blank() +// returns with JavaScript: {FullMatch:"b", StartMatch:2, SubMatches:[Blank()]} +``` + +Normally the engine is greedy and will match the `a` in the sub match. But this won't satisfy the full regular expression, and so .NET returns `blank`. JavaScript decides that the sub match isn't possible, so it chooses a different path where the sub match didn't happen at all. + +Without the backreference, or with the `*` inside the sub match, we have consistency between implementations: + +```powerapps-dot +>> Match( "ab", "(a)*b" , MatchOptions.NumberedSubMatches ) +// returns: {FullMatch:"ab", StartMatch:1, SubMatches:["a"]} + +>> Match( "ab", "(a*)b\1" , MatchOptions.NumberedSubMatches ) +// returns: {FullMatch:"b", StartMatch:2, SubMatches:[""]} +``` + + diff --git a/src/libraries/Microsoft.PowerFx.Core/Binding/BinderUtils.cs b/src/libraries/Microsoft.PowerFx.Core/Binding/BinderUtils.cs index b9027200df..25d2f0ccbd 100644 --- a/src/libraries/Microsoft.PowerFx.Core/Binding/BinderUtils.cs +++ b/src/libraries/Microsoft.PowerFx.Core/Binding/BinderUtils.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
using System.Collections.Generic; +using System.Globalization; using System.Linq; using Microsoft.CodeAnalysis; using Microsoft.PowerFx.Core.App.Controls; @@ -14,6 +15,7 @@ using Microsoft.PowerFx.Core.Localization; using Microsoft.PowerFx.Core.Logging.Trackers; using Microsoft.PowerFx.Core.Texl; +using Microsoft.PowerFx.Core.Texl.Builtins; using Microsoft.PowerFx.Core.Types; using Microsoft.PowerFx.Core.Types.Enums; using Microsoft.PowerFx.Core.Utils; @@ -1540,6 +1542,49 @@ public static bool TryGetConstantValue(CheckTypesContext context, TexlNode node, nodeValue = string.Join(string.Empty, parameters); return true; } + } + else if (callNode.Head.Name.Value == BuiltinFunctionsCore.Char.Name && callNode.Args.Children.Count == 1) + { + int val = -1; + + if (callNode.Args.Children[0].Kind == NodeKind.DecLit) + { + val = (int)((DecLitNode)callNode.Args.Children[0]).ActualDecValue; + } + else if (callNode.Args.Children[0].Kind == NodeKind.NumLit) + { + val = (int)((NumLitNode)callNode.Args.Children[0]).ActualNumValue; + } + + if (val < 1 || val > 255) + { + return false; + } + + nodeValue = ((char)val).ToString(); + return true; + } + else if (callNode.Head.Name.Value == BuiltinFunctionsCore.UniChar.Name && callNode.Args.Children.Count == 1) + { + int val = -1; + + if (callNode.Args.Children[0].Kind == NodeKind.DecLit) + { + val = (int)((DecLitNode)callNode.Args.Children[0]).ActualDecValue; + } + else if (callNode.Args.Children[0].Kind == NodeKind.NumLit) + { + val = (int)((NumLitNode)callNode.Args.Children[0]).ActualNumValue; + } + + // partial surrogate pair not supported, consistent with interpreter UniChar implementation + if (val < 1 || val > 0x10FFFF || (val >= 0xD800 && val <= 0xDFFF)) + { + return false; + } + + nodeValue = char.ConvertFromUtf32(val); + return true; } break; diff --git a/src/libraries/Microsoft.PowerFx.Core/Localization/Strings.cs b/src/libraries/Microsoft.PowerFx.Core/Localization/Strings.cs index ebbeeb730f..5f738365fe 100644 --- 
a/src/libraries/Microsoft.PowerFx.Core/Localization/Strings.cs +++ b/src/libraries/Microsoft.PowerFx.Core/Localization/Strings.cs @@ -741,7 +741,41 @@ internal static class TexlStrings public static ErrorResourceKey ErrDecimalRequiresPowerFxV1 = new ErrorResourceKey("ErrDecimalNeedsPowerFxV1"); public static ErrorResourceKey ErrInvalidRegEx = new ErrorResourceKey("ErrInvalidRegEx"); + public static ErrorResourceKey ErrInvalidRegExBadInlineOptions = new ErrorResourceKey("ErrInvalidRegExBadInlineOptions"); + public static ErrorResourceKey ErrInvalidRegExInlineOptionNotAtStart = new ErrorResourceKey("ErrInvalidRegExInlineOptionNotAtStart"); + public static ErrorResourceKey ErrInvalidRegExBadOctal = new ErrorResourceKey("ErrInvalidRegExBadOctal"); + public static ErrorResourceKey ErrInvalidRegExBadCurly = new ErrorResourceKey("ErrInvalidRegExBadCurly"); + public static ErrorResourceKey ErrInvalidRegExBadSquare = new ErrorResourceKey("ErrInvalidRegExBadSquare"); + public static ErrorResourceKey ErrInvalidRegExBadParen = new ErrorResourceKey("ErrInvalidRegExBadParen"); + public static ErrorResourceKey ErrInvalidRegExBadEscapeInsideCharacterClass = new ErrorResourceKey("ErrInvalidRegExBadEscapeInsideCharacterClass"); + public static ErrorResourceKey ErrInvalidRegExBadEscapeInsideNegativeCharacterClass = new ErrorResourceKey("ErrInvalidRegExBadEscapeInsideNegativeCharacterClass"); + public static ErrorResourceKey ErrInvalidRegExBadEscapeOutsideCharacterClass = new ErrorResourceKey("ErrInvalidRegExBadEscapeOutsideCharacterClass"); + public static ErrorResourceKey ErrInvalidRegExRepeatInCharClass = new ErrorResourceKey("ErrInvalidRegExRepeatInCharClass"); + public static ErrorResourceKey ErrInvalidRegExRepeatedInlineOption = new ErrorResourceKey("ErrInvalidRegExRepeatedInlineOption"); + public static ErrorResourceKey ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches = new ErrorResourceKey("ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches"); + public static 
ErrorResourceKey ErrInvalidRegExConflictingInlineOptions = new ErrorResourceKey("ErrInvalidRegExConflictingInlineOptions"); + public static ErrorResourceKey ErrInvalidRegExBadQuantifier = new ErrorResourceKey("ErrInvalidRegExBadQuantifier"); + public static ErrorResourceKey ErrInvalidRegExBadExactQuantifier = new ErrorResourceKey("ErrInvalidRegExBadExactQuantifier"); + public static ErrorResourceKey ErrInvalidRegExBadBackRefSelfReferencing = new ErrorResourceKey("ErrInvalidRegExBadBackRefSelfReferencing"); + public static ErrorResourceKey ErrInvalidRegExBadBackRefNotDefined = new ErrorResourceKey("ErrInvalidRegExBadBackRefNotDefined"); + public static ErrorResourceKey ErrInvalidRegExBadBalancing = new ErrorResourceKey("ErrInvalidRegExBadBalancing"); + public static ErrorResourceKey ErrInvalidRegExBadSingleQuoteNamedCapture = new ErrorResourceKey("ErrInvalidRegExBadSingleQuoteNamedCapture"); + public static ErrorResourceKey ErrInvalidRegExBadEscape = new ErrorResourceKey("ErrInvalidRegExBadEscape"); + public static ErrorResourceKey ErrInvalidRegExBadConditional = new ErrorResourceKey("ErrInvalidRegExBadConditional"); + public static ErrorResourceKey ErrInvalidRegExBadNamedCaptureAlreadyExists = new ErrorResourceKey("ErrInvalidRegExBadNamedCaptureAlreadyExists"); + public static ErrorResourceKey ErrInvalidRegExBadNamedCaptureName = new ErrorResourceKey("ErrInvalidRegExBadNamedCaptureName"); + public static ErrorResourceKey ErrInvalidRegExUnclosedCaptureGroups = new ErrorResourceKey("ErrInvalidRegExUnclosedCaptureGroups"); + public static ErrorResourceKey ErrInvalidRegExUnclosedInlineComment = new ErrorResourceKey("ErrInvalidRegExUnclosedInlineComment"); + public static ErrorResourceKey ErrInvalidRegExUnopenedCaptureGroups = new ErrorResourceKey("ErrInvalidRegExUnopenedCaptureGroups"); + public static ErrorResourceKey ErrInvalidRegExMixingNamedAndNumberedSubMatches = new ErrorResourceKey("ErrInvalidRegExMixingNamedAndNumberedSubMatches"); + public static 
ErrorResourceKey ErrInvalidRegExNumberedSubMatchesDisabled = new ErrorResourceKey("ErrInvalidRegExNumberedSubMatchesDisabled"); + public static ErrorResourceKey ErrInvalidRegExLiteralHyphenInCharacterClass = new ErrorResourceKey("ErrInvalidRegExLiteralHyphenInCharacterClass"); + public static ErrorResourceKey ErrInvalidRegExUnescapedCharInCharacterClass = new ErrorResourceKey("ErrInvalidRegExUnescapedCharInCharacterClass"); + public static ErrorResourceKey ErrInvalidRegExBadUnicodeCategory = new ErrorResourceKey("ErrInvalidRegExBadUnicodeCategory"); + public static ErrorResourceKey ErrInvalidRegExEmptyCharacterClass = new ErrorResourceKey("ErrInvalidRegExEmptyCharacterClass"); + public static ErrorResourceKey ErrVariableRegEx = new ErrorResourceKey("ErrVariableRegEx"); + public static ErrorResourceKey ErrVariableRegExOptions = new ErrorResourceKey("ErrVariableRegExOptions"); public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedFullMatchField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedFullMatchField"); public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedSubMatchesField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedSubMatchesField"); public static ErrorResourceKey InfoRegExCaptureNameHidesPredefinedStartMatchField = new ErrorResourceKey("InfoRegExCaptureNameHidesPredefinedStartMatchField"); diff --git a/src/libraries/Microsoft.PowerFx.Core/Public/Values/StringValue.cs b/src/libraries/Microsoft.PowerFx.Core/Public/Values/StringValue.cs index 2dabf96f14..e4ff4a41d8 100644 --- a/src/libraries/Microsoft.PowerFx.Core/Public/Values/StringValue.cs +++ b/src/libraries/Microsoft.PowerFx.Core/Public/Values/StringValue.cs @@ -36,11 +36,6 @@ public override void Visit(IValueVisitor visitor) visitor.Visit(this); } - internal StringValue ToLower() - { - return new StringValue(IRContext.NotInSource(FormulaType.String), Value.ToLowerInvariant()); - } - public override void ToExpression(StringBuilder sb, 
FormulaValueSerializerSettings settings) { sb.Append($"\"{CharacterUtils.ExcelEscapeString(Value)}\""); diff --git a/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/IsMatch.cs b/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/IsMatch.cs deleted file mode 100644 index 06d037ad9e..0000000000 --- a/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/IsMatch.cs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -using System.Collections.Generic; -using Microsoft.PowerFx.Core.App.ErrorContainers; -using Microsoft.PowerFx.Core.Binding; -using Microsoft.PowerFx.Core.Functions; -using Microsoft.PowerFx.Core.Localization; -using Microsoft.PowerFx.Core.Types; -using Microsoft.PowerFx.Core.Types.Enums; -using Microsoft.PowerFx.Core.Utils; -using Microsoft.PowerFx.Syntax; - -#pragma warning disable SA1649 // File name should match first type name - -namespace Microsoft.PowerFx.Core.Texl.Builtins -{ - // IsMatch(text:s, format:s) - // Checks if the input text is of the correct format. 
- internal sealed class IsMatchFunction : BuiltinFunction - { - public override bool UseParentScopeForArgumentSuggestions => true; - - public override bool IsSelfContained => true; - - public override bool HasPreciseErrors => true; - - public IsMatchFunction() - : base("IsMatch", TexlStrings.AboutIsMatch, FunctionCategories.Text, DType.Boolean, 0, 2, 3, DType.String, BuiltInEnums.MatchEnum.FormulaType._type, BuiltInEnums.MatchOptionsEnum.FormulaType._type) - { - } - - public override IEnumerable GetRequiredEnumNames() - { - return new List() { LanguageConstants.MatchEnumString, LanguageConstants.MatchOptionsEnumString }; - } - - public override IEnumerable GetSignatures() - { - yield return new[] { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2 }; - yield return new[] { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2, TexlStrings.IsMatchArg3 }; - } - - public override void CheckSemantics(TexlBinding binding, TexlNode[] args, DType[] argTypes, IErrorContainer errors) - { - if ((argTypes[1].Kind != DKind.String && argTypes[1].Kind != DKind.OptionSetValue) || !binding.IsConstant(args[1])) - { - errors.EnsureError(args[1], TexlStrings.ErrVariableRegEx); - } - } - - public override bool HasSuggestionsForParam(int index) - { - Contracts.Assert(index >= 0); - - return index <= 2; - } - } -} - -#pragma warning restore SA1649 // File name should match first type name diff --git a/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs b/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs index 83f2945e2d..f7399f07bd 100644 --- a/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs +++ b/src/libraries/Microsoft.PowerFx.Core/Texl/Builtins/Match.cs @@ -4,6 +4,10 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Linq; +using System.Text; using System.Text.RegularExpressions; using Microsoft.PowerFx.Core.App.ErrorContainers; using 
Microsoft.PowerFx.Core.Binding; @@ -20,11 +24,26 @@ namespace Microsoft.PowerFx.Core.Texl.Builtins { + // IsMatch(text:s, regular_expression:s, [options:s]) + internal class IsMatchFunction : BaseMatchFunction + { + public IsMatchFunction() + : base("IsMatch", TexlStrings.AboutIsMatch, DType.Boolean, null) + { + } + + public override IEnumerable GetSignatures() + { + yield return new[] { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2 }; + yield return new[] { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2, TexlStrings.IsMatchArg3 }; + } + } + // Match(text:s, regular_expression:s, [options:s]) internal class MatchFunction : BaseMatchFunction { - public MatchFunction(RegexTypeCache regexCache) - : base("Match", TexlStrings.AboutMatch, DType.EmptyRecord, regexCache) + public MatchFunction(RegexTypeCache regexTypeCache) + : base("Match", TexlStrings.AboutMatch, DType.EmptyRecord, regexTypeCache) { } } @@ -32,14 +51,14 @@ public MatchFunction(RegexTypeCache regexCache) // MatchAll(text:s, regular_expression:s, [options:s]) internal class MatchAllFunction : BaseMatchFunction { - public MatchAllFunction(RegexTypeCache regexCache) - : base("MatchAll", TexlStrings.AboutMatchAll, DType.EmptyTable, regexCache) + public MatchAllFunction(RegexTypeCache regexTypeCache) + : base("MatchAll", TexlStrings.AboutMatchAll, DType.EmptyTable, regexTypeCache) { } } internal class BaseMatchFunction : BuiltinFunction - { + { private readonly ConcurrentDictionary> _regexTypeCache; private readonly string _cachePrefix; private readonly int _regexCacheSize; @@ -48,12 +67,17 @@ internal class BaseMatchFunction : BuiltinFunction public override bool SupportsParamCoercion => true; - public BaseMatchFunction(string functionName, TexlStrings.StringGetter aboutGetter, DType returnType, RegexTypeCache regexCache) + public override bool UseParentScopeForArgumentSuggestions => true; + + public BaseMatchFunction(string functionName, TexlStrings.StringGetter aboutGetter, DType returnType, 
RegexTypeCache regexTypeCache) : base(functionName, aboutGetter, FunctionCategories.Text, returnType, 0, 2, 3, DType.String, BuiltInEnums.MatchEnum.FormulaType._type, BuiltInEnums.MatchOptionsEnum.FormulaType._type) { - _cachePrefix = returnType.IsTable ? "tbl_" : "rec_"; - _regexTypeCache = regexCache.Cache; - _regexCacheSize = regexCache.CacheSize; + if (regexTypeCache != null) + { + _cachePrefix = returnType.IsTable ? "tbl_" : "rec_"; + _regexTypeCache = regexTypeCache.Cache; + _regexCacheSize = regexTypeCache.CacheSize; + } } public override IEnumerable GetSignatures() @@ -65,7 +89,14 @@ public BaseMatchFunction(string functionName, TexlStrings.StringGetter aboutGett public override IEnumerable GetRequiredEnumNames() { return new List() { LanguageConstants.MatchEnumString, LanguageConstants.MatchOptionsEnumString }; - } + } + + public override bool HasSuggestionsForParam(int index) + { + Contracts.Assert(index >= 0); + + return index <= 2; + } public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DType[] argTypes, IErrorContainer errors, out DType returnType, out Dictionary nodeToCoercedTypeMap) { @@ -77,21 +108,565 @@ public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DTyp Contracts.AssertValue(errors); bool fValid = base.CheckTypes(context, args, argTypes, errors, out returnType, out nodeToCoercedTypeMap); - Contracts.Assert(returnType.IsRecord || returnType.IsTable); - TexlNode regExNode = args[1]; + Contracts.Assert(returnType.IsRecord || returnType.IsTable || returnType == DType.Boolean); + + string regularExpressionOptions = string.Empty; + var regExNode = args[1]; - if ((argTypes[1].Kind != DKind.String && argTypes[1].Kind != DKind.OptionSetValue) || !BinderUtils.TryGetConstantValue(context, regExNode, out var nodeValue)) + if ((argTypes[1].Kind != DKind.String && argTypes[1].Kind != DKind.OptionSetValue) || !BinderUtils.TryGetConstantValue(context, regExNode, out var regularExpression)) { 
errors.EnsureError(regExNode, TexlStrings.ErrVariableRegEx); return false; } - string regularExpression = nodeValue; - return fValid && TryCreateReturnType(regExNode, regularExpression, errors, ref returnType); + if (context.Features.PowerFxV1CompatibilityRules && args.Length == 3 && + ((argTypes[2].Kind != DKind.String && argTypes[2].Kind != DKind.OptionSetValue) || !BinderUtils.TryGetConstantValue(context, args[2], out regularExpressionOptions))) + { + errors.EnsureError(args[2], TexlStrings.ErrVariableRegExOptions); + return false; + } + + if (!context.Features.PowerFxV1CompatibilityRules) + { + // only used for the following analysis and type creation, not modified in the IR + regularExpressionOptions += "N"; + } + + string alteredOptions = regularExpressionOptions; + + return fValid && + (!context.Features.PowerFxV1CompatibilityRules || IsSupportedRegularExpression(regExNode, regularExpression, regularExpressionOptions, out alteredOptions, errors)) && + (returnType == DType.Boolean || TryCreateReturnType(regExNode, regularExpression, alteredOptions, errors, ref returnType)); + } + + private static readonly IReadOnlyCollection UnicodeCategories = new HashSet() + { + "L", "Lu", "Ll", "Lt", "Lm", "Lo", + "M", "Mn", "Mc", "Me", + "N", "Nd", "Nl", "No", + "P", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", + "S", "Sm", "Sc", "Sk", "So", + "Z", "Zs", "Zl", "Zp", + "Cc", "Cf", + + // "C", "Cs", "Co", "Cn", are left out for now until we have a good scenario, as they differ between implementations + }; + + // Power Fx regular expressions are limited to features that can be transpiled to native .NET (C# Interpreter), ECMAScript (Canvas), or PCRE2 (Excel). + // We want the same results everywhere for Power Fx, even if the underlying implementation is different. Even with these limits in place there are some minor semantic differences but we get as close as we can. 
+ // These tests can be run through all three engines and the results compared by setting ExpressionEvaluationTests.RegExCompareEnabled; a PCRE2 DLL and NodeJS must be installed on the system. + // + // In short, we use the intersection of canonical .NET regular expressions and ECMAScript 2024's "v" flag for escaping rules. + // Someday when "v" is more widely available, we can support more of its features such as set subtraction. + // We chose to use canonical .NET instead of RegexOptions.ECMAScript because we wanted the Unicode definitions for words. See https://learn.microsoft.com/dotnet/standard/base-types/regular-expression-options#ecmascript-matching-behavior + // + // In addition, Power Fx regular expressions are opinionated and try to eliminate some of the ambiguity in the common regular expression language: + // Numbered capture groups are disabled by default, and cannot be mixed with named capture groups. + // Octal character codes are not supported, use \x or \u instead. + // Literal ^, -, [, ], {, and } must be escaped when used in a character class. + // Escaping is only supported for special characters and unknown alphanumeric escape sequences are not supported. + // Unicode characters are used throughout. + // Newlines support Windows friendly \r\n as well as \r and \n. + // + // Features that are supported: + // Literal characters. Any character except the special characters [ ] \ ^ $ . | ? * + ( ) can be inserted directly. + // Escaped special characters. \ (backslash) followed by a special character to insert it directly, includes \- when in a character class. + // Operators + // Dot (.), matches everything except [\r\n] unless MatchOptions.DotAll is used. + // Anchors, ^ and $, match the beginning and end of the string, or of a line if MatchOptions.Multiline is used. + // Quantifiers + // Greedy quantifiers. ?
matches 0 or 1 times, + matches 1 or more times, * matches 0 or more times, {3} matches exactly 3 times, {1,} matches at least 1 time, {1,3} matches between 1 and 3 times. By default, matching is "greedy" and the match will be as large as possible. + // Lazy quantifiers. Same as the greedy quantifiers followed by ?, for example *? or {1,3}?. With the lazy modifier, the match will be as small as possible. + // Alternation. a|b matches "a" or "b". + // Character classes + // Custom character class. [abc] list of characters, [a-fA-F0-9] range of characters, [^a-z] everything but these characters. Character classes cannot be nested, subtracted, or intersected, and the same special character cannot be repeated in the character class. + // Word characters and breaks. \w, \W, \b, \B, using the Unicode definition of letters [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}\p{Pc}\p{Lm}]. + // Digit characters. \d includes the digits 0-9 and \p{Nd}, \D matches everything except characters matched by \d. + // Space characters. \s includes spacing characters [ \r\n\t\f\x0B\x85\p{Z}], \S which matches everything except characters matched by \s, \r carriage return, \n newline, \t tab, \f form feed. + // Control characters. \cA, where the control character is [A-Za-z]. + // Hexadecimal and Unicode character codes. \x20 with two hexadecimal digits, \u2028 with four hexadecimal digits. + // Unicode character class and property. \p{Ll} matches all Unicode lowercase letters, while \P{Ll} matches everything that is not a Unicode lowercase letter. + // Capture groups + // Non capture group. (?:a), group without capturing the result as a named or numbered sub-match. + // Named group and back reference. (?<name>chars) captures a sub-match with the name name, referenced with \k<name>. Cannot be used if MatchOptions.NumberedSubMatches is enabled. + // Numbered group and back references. (a|b) captures a sub-match, referenced with \1. MatchOptions.NumberedSubMatches must be enabled. + // Lookahead and lookbehind.
(?=a), (?!a), (?<=b), (? . + // will report an error and stop further processing. + // One might think that the "\a" could have matched , but it will match first because it is first in the RE. + // One might think that the "\(" could have matched , but the double backslashes will be consumed first, which is why it is important + // to gather all the matches in a linear scan from the beginning to the end. + // + // Three regular expressions are utilized: + // - escapeRE is a regular expression fragment that is shared by the other two, included at the beginning each of the others + // - generalRE is used outside of a character class + // - characterClassRE is used inside a character class + + const string escapeRE = + @" + # leading backslash, escape sequences + \\k<(?\w+)> | # named backreference + (?\\0\d*) | # \0 and octal are not accepted, ambiguous and not needed (use \x instead) + \\(?\d+) | # numeric backreference, must be enabled with MatchOptions.NumberedSubMatches + (?\\ + ([dfnrstw] | # standard regex character classes, missing from .NET are aAeGzZv (no XRegExp support), other common are u{} and o + [\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/] | # acceptable escaped characters with Unicode aware ECMAScript + [\#\ ] | # added for free spacing, always accepted for conssitency even in character classes, escape needs to be removed on Unicode aware ECMAScript + c[a-zA-Z] | # Ctrl character classes + x[0-9a-fA-F]{2} | # hex character, must be exactly 2 hex digits + u[0-9a-fA-F]{4})) | # Unicode characters, must be exactly 4 hex digits + \\(?[pP])\{(?[\w=:-]+)\} | # Unicode chaeracter classes, extra characters here for a better error message + (?\\[bB]) | # acceptable outside a character class, includes negative classes until we have character class subtraction, include \P for future MatchOptions.LocaleAware + (?\\[DWS]) | + (?\\[&\-!#%,;:<=>@`~\^]) | # https://262.ecma-international.org/#prod-ClassSetReservedPunctuator, others covered with goodEscape above + (?\\.) 
| # all other escaped characters are invalid and reserved for future use + "; + + var generalRE = new Regex( + escapeRE + + @" + # leading (?<, named captures + \(\?<(?[a-zA-Z][a-zA-Z\d]*)> | # named capture group, can only be letters and numbers and must start with a letter + (?\(\?(=|!|<=|\(\?<\w*-\w*>) | # .NET balancing captures are not supported + (?\(\?<[^>]*>) | # bad named capture name, didn't match goodNamedCapture + (?\(\?'[^']*') | # single quoted capture names are not supported + + # leading (?, misc + (?\(\?:) | # non-capture group, still need to track to match with closing paren + \A\(\?(?[imnsx]+)\) | # inline options + (?\(\?\#) | # inline comment + (?\(\?(\w+|\w*-\w+)[\:\)]) | # inline options, including disable of options + (?\(\?\() | # .NET conditional alternations are not supported + + # leading (, used for other special purposes + (?\([\?\+\*].?) | # everything else unsupported that could start with a (, includes atomic groups, recursion, subroutines, branch reset, and future features + + # leading ?\*\+, quantifiers + (?[\?\*\+][\+\*]) | # possessive (ends with +) and useless quantifiers (ends with *) + (?[\?\*\+]\??) | # greedy and lazy quantifiers + + # leading {, limited quantifiers + (?{\d+}[\+\*\?]) | # exact quantifier can't be used with a modifier + (?{\d+}) | # standard exact quantifier, no optional lazy + (?{\d+,\d*}[\+|\*]) | # possessive and useless quantifiers + (?{\d+,\d*}\??) 
| # standard limited quantifiers, with optional lazy + (?[{}]) | # more constrained, blocks {,3} and Java/Rust semantics that does not treat this as a literal + + # character class + (?\[\]|\[^\]) | # some implementations support empty character class, with varying semantics; we do not + \[(?(\\\]|\\\[|[^\]\[])+)\] | # does not accept empty character class + (?[\[\]]) | # square brackets that are not escaped and didn't define a character class + + # open and close regions + (?\() | + (?\)) | + (?\#) | # used in free spacing mode (to detect start of comment), ignored otherwise + (?[\r\n]) # used in free spacing mode (to detect end of comment), ignored otherwise + ", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture); + + var characterClassRE = new Regex( + escapeRE + + @" + (?^-|-$) | # begin/end literal hyphen not allowed within character class, needs to be escaped (ECMAScript v) + (? \/ | \| | \\ | # https://262.ecma-international.org/#prod-ClassSetSyntaxCharacter + \{ | \} | \( | \) | \[ | \] | \^) | # adding ^ for Power Fx, making it clear that the carets in [^^] have different meanings + (? << | == | >> | :: | # reserved pairs, see https://262.ecma-international.org/#prod-ClassSetReservedDoublePunctuator + @@ | `` | ~~ | %% | && | ;; | ,, | !! | # and https://www.unicode.org/reports/tr18/#Subtraction_and_Intersection + \|\| | \#\# | \$\$ | \*\* | \+\+ | \.\. | # includes set subtraction + \?\? 
| \^\^ | \-\-) + ", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture); + + int captureNumber = 0; // last numbered capture encountered + var captureStack = new Stack(); // stack of all open capture groups, including null for non capturing groups, for detecting if a named group is closed + var captureNames = new List(); // list of seen named groups, does not included numbered groups or non capture groups + + bool openPoundComment = false; // there is an open end-of-line pound comment, only in freeFormMode + bool openInlineComment = false; // there is an open inline comment + + foreach (Match token in generalRE.Matches(regexPattern)) + { + void RegExError(ErrorResourceKey errKey, Match errToken = null, bool context = false) + { + if (errToken == null) + { + errToken = token; + } + + if (context) + { + const int contextLength = 8; + var tokenEnd = errToken.Index + errToken.Length; + var found = tokenEnd >= contextLength ? "..." + regexPattern.Substring(tokenEnd - contextLength, contextLength) : regexPattern.Substring(0, tokenEnd); + errors.EnsureError(regExNode, errKey, found); + } + else + { + errors.EnsureError(regExNode, errKey, errToken.Value); + } + } + + if (token.Groups["newline"].Success) + { + openPoundComment = false; + } + else if (openInlineComment && (token.Groups["closeParen"].Success || token.Groups["goodEscape"].Value == "\\)")) + { + openInlineComment = false; + } + else if (!openPoundComment && !openInlineComment) + { + if (token.Groups["goodEscape"].Success || token.Groups["goodQuantifiers"].Success || token.Groups["goodExact"].Success || token.Groups["goodLimited"].Success || token.Groups["goodEscapeOutsideCC"].Success || token.Groups["goodEscapeOutsideAndInsideCCIfPositive"].Success) + { + // all is well, nothing to do + } + else if (token.Groups["characterClass"].Success) + { + bool characterClassNegative = token.Groups["characterClass"].Value[0] == '^'; + string ccString = characterClassNegative ? 
token.Groups["characterClass"].Value.Substring(1) : token.Groups["characterClass"].Value; + + foreach (Match ccToken in characterClassRE.Matches(ccString)) + { + void CCRegExError(ErrorResourceKey errKey) + { + RegExError(errKey, errToken: ccToken); + } + + if (ccToken.Groups["goodEscape"].Success || ccToken.Groups["goodEscapeInsideCCOnly"].Success) + { + // all good, nothing to do + } + else if (ccToken.Groups["goodEscapeOutsideAndInsideCCIfPositive"].Success) + { + if (characterClassNegative) + { + CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideNegativeCharacterClass); + return false; + } + } + else if (ccToken.Groups["goodUEscape"].Success) + { + if (ccToken.Groups["goodUEscape"].Value == "P" && characterClassNegative) + { + // would be problematic for us to allow this if we wanted to implement MatchOptions.LocaleAware in the future + CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideNegativeCharacterClass); + return false; + } + + if (!UnicodeCategories.Contains(ccToken.Groups["UCategory"].Value)) + { + CCRegExError(TexlStrings.ErrInvalidRegExBadUnicodeCategory); + return false; + } + } + else if (ccToken.Groups["badEscape"].Success) + { + CCRegExError(TexlStrings.ErrInvalidRegExBadEscape); + return false; + } + else if (ccToken.Groups["goodEscapeOutsideCC"].Success || ccToken.Groups["backRefName"].Success || ccToken.Groups["backRefNumber"].Success) + { + CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideCharacterClass); + return false; + } + else if (ccToken.Groups["badOctal"].Success) + { + CCRegExError(TexlStrings.ErrInvalidRegExBadOctal); + return false; + } + else if (ccToken.Groups["badInCharClass"].Success) + { + CCRegExError(TexlStrings.ErrInvalidRegExUnescapedCharInCharacterClass); + return false; + } + else if (ccToken.Groups["badDoubleInCharClass"].Success) + { + CCRegExError(TexlStrings.ErrInvalidRegExRepeatInCharClass); + return false; + } + else if (ccToken.Groups["badHyphen"].Success) + { + // intentionally RegExError to get the 
whole character class as this is on the ends + RegExError(TexlStrings.ErrInvalidRegExLiteralHyphenInCharacterClass); + return false; + } + else + { + // This should never be hit. It is here in case one of the names checked doesn't match the RE, in which case running tests would hit this. + throw new NotImplementedException("Unknown character class regular expression match: CC = " + token.Value + ", ccToken = " + ccToken.Value); + } + } + } + else if (token.Groups["goodNamedCapture"].Success) + { + var namedCapture = token.Groups["goodNamedCapture"].Value; + + if (numberedCpature) + { + RegExError(TexlStrings.ErrInvalidRegExMixingNamedAndNumberedSubMatches); + return false; + } + + if (captureNames.Contains(namedCapture)) + { + RegExError(TexlStrings.ErrInvalidRegExBadNamedCaptureAlreadyExists); + return false; + } + + captureStack.Push(namedCapture); + captureNames.Add(namedCapture); + } + else if (token.Groups["goodNonCapture"].Success || token.Groups["goodLookaround"].Success) + { + captureStack.Push(null); + } + else if (token.Groups["openParen"].Success) + { + if (numberedCpature) + { + captureNumber++; + captureStack.Push(captureNumber.ToString(CultureInfo.InvariantCulture)); + } + else + { + captureStack.Push(null); + } + } + else if (token.Groups["closeParen"].Success) + { + if (captureStack.Count == 0) + { + RegExError(TexlStrings.ErrInvalidRegExUnopenedCaptureGroups, context: true); + return false; + } + else + { + captureStack.Pop(); + } + } + else if (token.Groups["backRefName"].Success) + { + var backRefName = token.Groups["backRefName"].Value; + + if (numberedCpature) + { + RegExError(TexlStrings.ErrInvalidRegExMixingNamedAndNumberedSubMatches); + return false; + } + + // group isn't defined, or not defined yet + if (!captureNames.Contains(backRefName)) + { + RegExError(TexlStrings.ErrInvalidRegExBadBackRefNotDefined); + return false; + } + + // group is not closed and thus self referencing + if (captureStack.Contains(backRefName)) + { + 
RegExError(TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing); + return false; + } + } + else if (token.Groups["backRefNumber"].Success) + { + var backRef = token.Groups["backRefNumber"].Value; + var backRefNumber = Convert.ToInt32(backRef, CultureInfo.InvariantCulture); + + if (!numberedCpature) + { + RegExError(TexlStrings.ErrInvalidRegExNumberedSubMatchesDisabled); + return false; + } + + // back ref number has not yet been defined + if (backRefNumber < 1 || backRefNumber > captureNumber) + { + RegExError(TexlStrings.ErrInvalidRegExBadBackRefNotDefined); + return false; + } + + // group is not closed and thus self referencing + if (captureStack.Contains(backRef)) + { + RegExError(TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing); + return false; + } + } + else if (token.Groups["goodUEscape"].Success) + { + if (!UnicodeCategories.Contains(token.Groups["UCategory"].Value)) + { + RegExError(TexlStrings.ErrInvalidRegExBadUnicodeCategory); + return false; + } + } + else if (token.Groups["goodInlineOptions"].Success) + { + var inlineOptions = token.Groups["goodInlineOptions"].Value; + + if (Regex.IsMatch(inlineOptions, @"(?.).*\k")) + { + RegExError(TexlStrings.ErrInvalidRegExRepeatedInlineOption); + return false; + } + + if (inlineOptions.Contains("n") && numberedCpature) + { + RegExError(TexlStrings.ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches); + return false; + } + + if (inlineOptions.Contains("x")) + { + freeSpacing = true; + } + } + else if (token.Groups["goodInlineComment"].Success) + { + openInlineComment = true; + } + else if (token.Groups["poundComment"].Success) + { + openPoundComment = freeSpacing; + } + else if (token.Groups["badNamedCaptureName"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadNamedCaptureName); + return false; + } + else if (token.Groups["badOctal"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadOctal); + return false; + } + else if (token.Groups["badBalancing"].Success) + { + 
RegExError(TexlStrings.ErrInvalidRegExBadBalancing); + return false; + } + else if (token.Groups["badInlineOptions"].Success) + { + RegExError(token.Groups["badInlineOptions"].Index > 0 ? TexlStrings.ErrInvalidRegExInlineOptionNotAtStart : TexlStrings.ErrInvalidRegExBadInlineOptions); + return false; + } + else if (token.Groups["badSingleQuoteNamedCapture"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadSingleQuoteNamedCapture); + return false; + } + else if (token.Groups["badConditional"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadConditional); + return false; + } + else if (token.Groups["badEscape"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadEscape); + return false; + } + else if (token.Groups["goodEscapeInsideCCOnly"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadEscapeOutsideCharacterClass); + return false; + } + else if (token.Groups["badQuantifiers"].Success || token.Groups["badLimited"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadQuantifier); + return false; + } + else if (token.Groups["badExact"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadExactQuantifier); + return false; + } + else if (token.Groups["badCurly"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadCurly); + return false; + } + else if (token.Groups["badParen"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadParen, context: true); + return false; + } + else if (token.Groups["badSquareBrackets"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExBadSquare, context: true); + return false; + } + else if (token.Groups["badEmptyCharacterClass"].Success) + { + RegExError(TexlStrings.ErrInvalidRegExEmptyCharacterClass); + return false; + } + else + { + // This should never be hit. It is here in case one of the Groups names checked doesn't match the RE, in which case running tests would hit this. 
+ throw new NotImplementedException("Unknown general regular expression match: " + token.Value); + } + } + } + + if (openInlineComment) + { + errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExUnclosedInlineComment); + return false; + } + + if (captureStack.Count > 0) + { + errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExUnclosedCaptureGroups); + return false; + } + + // may be modifed by inline options; we only care about x and N in the next stage + alteredOptions = (freeSpacing ? "x" : string.Empty) + (numberedCpature ? "N" : string.Empty); + + return true; } // Creates a typed result: [Match:s, Captures:*[Value:s], NamedCaptures:r[:s]] - private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, IErrorContainer errors, ref DType returnType) + private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, string alteredOptions, IErrorContainer errors, ref DType returnType) { Contracts.AssertValue(regexPattern); string prefixedRegexPattern = this._cachePrefix + regexPattern; @@ -122,7 +697,16 @@ private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, IError try { - var regex = new Regex(regexPattern); + var regexDotNetOptions = RegexOptions.None; + if (alteredOptions.Contains("x")) + { + regexDotNetOptions |= RegexOptions.IgnorePatternWhitespace; + + // In x mode, comment line endings are [\r\n], but .NET only supports \n. For our purposes here, we can just replace the \r. 
+ regexPattern = regexPattern.Replace('\r', '\n'); + } + + var regex = new Regex(regexPattern, regexDotNetOptions); List propertyNames = new List(); bool fullMatchHidden = false, subMatchesHidden = false, startMatchHidden = false; @@ -156,7 +740,7 @@ private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, IError propertyNames.Add(new TypedName(DType.String, ColumnName_FullMatch)); } - if (!subMatchesHidden) + if (!subMatchesHidden && alteredOptions.Contains("N")) { propertyNames.Add(new TypedName(DType.CreateTable(new TypedName(DType.String, ColumnName_Value)), ColumnName_SubMatches)); } diff --git a/src/libraries/Microsoft.PowerFx.Core/Types/Enums/BuiltInEnums.cs b/src/libraries/Microsoft.PowerFx.Core/Types/Enums/BuiltInEnums.cs index e560eb2c06..86ef9c91df 100644 --- a/src/libraries/Microsoft.PowerFx.Core/Types/Enums/BuiltInEnums.cs +++ b/src/libraries/Microsoft.PowerFx.Core/Types/Enums/BuiltInEnums.cs @@ -86,7 +86,10 @@ internal static class BuiltInEnums { "Complete", "^c$" }, { "Contains", "c" }, { "IgnoreCase", "i" }, - { "Multiline", "m" } + { "Multiline", "m" }, + { "FreeSpacing", "x" }, + { "DotAll", "s" }, + { "NumberedSubMatches", "N" } }, canConcatenateStronglyTyped: true); diff --git a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryOperators.cs b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryOperators.cs index 79f1110fe5..ab477d860e 100644 --- a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryOperators.cs +++ b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryOperators.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
using System; +using System.Globalization; using System.Linq; using Microsoft.PowerFx.Core.IR; using Microsoft.PowerFx.Core.Utils; @@ -719,9 +720,9 @@ private static BooleanValue NotEqualPolymorphic(IRContext irContext, FormulaValu } // See in_SS in JScript membershipReplacementFunctions - public static Func StringInOperator(bool exact) + public static Func StringInOperator(bool exact) { - return (irContext, args) => + return (services, irContext, args) => { var left = args[0]; var right = args[1]; @@ -737,23 +738,25 @@ public static Func StringInOperator(boo var leftStr = (StringValue)left; var rightStr = (StringValue)right; - - return new BooleanValue(irContext, rightStr.Value.IndexOf(leftStr.Value, exact ? StringComparison.Ordinal : StringComparison.OrdinalIgnoreCase) >= 0); + + return new BooleanValue(irContext, services.GetService().CompareInfo.IndexOf(rightStr.Value, leftStr.Value, exact ? CompareOptions.Ordinal : CompareOptions.IgnoreCase) >= 0); }; } // Left is a scalar. Right is a single-column table. 
// See in_ST() - public static Func InScalarTableOperator(bool exact) + public static Func InScalarTableOperator(bool exact) { - return (irContext, args) => + return (services, irContext, args) => { var left = args[0]; - var right = args[1]; - + var right = args[1]; + + var cultureInfo = services.GetService(); + if (!exact && left is StringValue strLhs) { - left = strLhs.ToLower(); + left = new StringValue(IRContext.NotInSource(FormulaType.String), cultureInfo.TextInfo.ToLower(strLhs.Value)); } var source = (TableValue)right; @@ -766,7 +769,7 @@ public static Func InScalarTableOperato if (!exact && rhs is StringValue strRhs) { - rhs = strRhs.ToLower(); + rhs = new StringValue(IRContext.NotInSource(FormulaType.String), cultureInfo.TextInfo.ToLower(strRhs.Value)); } if (RuntimeHelpers.AreEqual(left, rhs)) diff --git a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryRegEx.cs b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryRegEx.cs index 0be6b17bae..18dbe56950 100644 --- a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryRegEx.cs +++ b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryRegEx.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Net.NetworkInformation; +using System.Text; using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; @@ -21,14 +23,6 @@ internal static partial class Library { // https://learn.microsoft.com/en-us/power-platform/power-fx/reference/function-ismatch - public const string FULLMATCH = "FullMatch"; - public const string STARTMATCH = "StartMatch"; - public const string SUBMATCHES = "SubMatches"; - - private const string DefaultIsMatchOptions = "^c$"; - private const string DefaultMatchOptions = "c"; - private const string DefaultMatchAllOptions = "c"; - /// /// Creates instances of the [Is]Match[All] functions and returns them so they can be added to the runtime. 
/// @@ -59,16 +53,17 @@ internal class IsMatchImplementation : RegexCommonImplementation { private readonly TimeSpan _regexTimeout; - protected override string RegexOptions => DefaultIsMatchOptions; + protected override string DefaultRegexOptions => DefaultIsMatchOptions; public IsMatchImplementation(TimeSpan regexTimeout) { _regexTimeout = regexTimeout; } - protected override FormulaValue InvokeRegexFunction(string input, string regex, RegexOptions options) + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) { - Regex rex = new Regex(regex, options, _regexTimeout); + var (regexAltered, regexOptions) = AlterRegex_DotNet(regex, options); + Regex rex = new Regex(regexAltered, regexOptions, _regexTimeout); bool b = rex.IsMatch(input); return new BooleanValue(IRContext.NotInSource(FormulaType.Boolean), b); @@ -79,24 +74,25 @@ internal class MatchImplementation : RegexCommonImplementation { private readonly TimeSpan _regexTimeout; - protected override string RegexOptions => DefaultMatchOptions; + protected override string DefaultRegexOptions => DefaultMatchOptions; public MatchImplementation(TimeSpan regexTimeout) { _regexTimeout = regexTimeout; } - protected override FormulaValue InvokeRegexFunction(string input, string regex, RegexOptions options) + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) { - Regex rex = new Regex(regex, options, _regexTimeout); + var (regexAltered, regexOptions) = AlterRegex_DotNet(regex, options); + Regex rex = new Regex(regexAltered, regexOptions, _regexTimeout); Match m = rex.Match(input); if (!m.Success) { - return new BlankValue(IRContext.NotInSource(new KnownRecordType(GetRecordTypeFromRegularExpression(regex)))); + return new BlankValue(IRContext.NotInSource(new KnownRecordType(GetRecordTypeFromRegularExpression(regexAltered, regexOptions)))); } - return GetRecordFromMatch(rex, m); + return GetRecordFromMatch(rex, m, regexOptions); } } @@ 
-104,101 +100,42 @@ internal class MatchAllImplementation : RegexCommonImplementation { private readonly TimeSpan _regexTimeout; - protected override string RegexOptions => DefaultMatchAllOptions; + protected override string DefaultRegexOptions => DefaultMatchAllOptions; public MatchAllImplementation(TimeSpan regexTimeout) { _regexTimeout = regexTimeout; } - protected override FormulaValue InvokeRegexFunction(string input, string regex, RegexOptions options) + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) { - Regex rex = new Regex(regex, options, _regexTimeout); + var (regexAltered, regexOptions) = AlterRegex_DotNet(regex, options); + Regex rex = new Regex(regexAltered, regexOptions, _regexTimeout); MatchCollection mc = rex.Matches(input); List records = new (); foreach (Match m in mc) { - records.Add(GetRecordFromMatch(rex, m)); + records.Add(GetRecordFromMatch(rex, m, regexOptions)); } - return TableValue.NewTable(new KnownRecordType(GetRecordTypeFromRegularExpression(regex)), records.ToArray()); + return TableValue.NewTable(new KnownRecordType(GetRecordTypeFromRegularExpression(regexAltered, regexOptions)), records.ToArray()); } } - private static RecordValue GetRecordFromMatch(Regex rex, Match m) - { - Dictionary fields = new () - { - { FULLMATCH, new NamedValue(FULLMATCH, StringValue.New(m.Value)) }, - { STARTMATCH, new NamedValue(STARTMATCH, NumberValue.New((double)m.Index + 1)) } - }; - - List subMatches = new List(); - string[] groupNames = rex.GetGroupNames(); - - for (int i = 0; i < groupNames.Length; i++) - { - string groupName = groupNames[i]; - string validName = DName.MakeValid(groupName, out _).Value; - Group g = m.Groups[i]; - - if (!int.TryParse(groupName, out _)) - { - if (!fields.ContainsKey(validName)) - { - fields.Add(validName, new NamedValue(validName, StringValue.New(g.Value))); - } - else - { - fields[validName] = new NamedValue(validName, StringValue.New(g.Value)); - } - } - - if (i > 0) - { 
- subMatches.Add(g.Value); - } - } - - if (!fields.ContainsKey(SUBMATCHES)) - { - fields.Add(SUBMATCHES, new NamedValue(SUBMATCHES, TableValue.NewSingleColumnTable(subMatches.Select(s => StringValue.New(s)).ToArray()))); - } - - return RecordValue.NewRecordFromFields(fields.Values); - } - - private static DType GetRecordTypeFromRegularExpression(string regularExpression) + internal abstract class RegexCommonImplementation : IAsyncTexlFunction { - Dictionary propertyNames = new (); - Regex rex = new Regex(regularExpression); - - propertyNames.Add(FULLMATCH, new TypedName(DType.String, new DName(FULLMATCH))); - propertyNames.Add(STARTMATCH, new TypedName(DType.Number, new DName(STARTMATCH))); - propertyNames.Add(SUBMATCHES, new TypedName(DType.CreateTable(new TypedName(DType.String, new DName(TexlFunction.ColumnName_ValueStr))), new DName(SUBMATCHES))); - - foreach (string groupName in rex.GetGroupNames()) - { - if (!int.TryParse(groupName, out _)) - { - DName validName = DName.MakeValid(groupName, out _); - - if (!propertyNames.ContainsKey(validName.Value)) - { - propertyNames.Add(validName.Value, new TypedName(DType.String, validName)); - } - } - } + internal abstract FormulaValue InvokeRegexFunction(string input, string regex, string options); - return DType.CreateRecord(propertyNames.Values); - } + protected abstract string DefaultRegexOptions { get; } - internal abstract class RegexCommonImplementation : IAsyncTexlFunction - { - protected abstract FormulaValue InvokeRegexFunction(string input, string regex, RegexOptions options); + protected const string FULLMATCH = "FullMatch"; + protected const string STARTMATCH = "StartMatch"; + protected const string SUBMATCHES = "SubMatches"; - protected abstract string RegexOptions { get; } + protected const string DefaultIsMatchOptions = "^c$"; + protected const string DefaultMatchOptions = "c"; + protected const string DefaultMatchAllOptions = "c"; public Task InvokeAsync(FormulaValue[] args, CancellationToken 
cancellationToken) { @@ -241,34 +178,12 @@ public Task InvokeAsync(FormulaValue[] args, CancellationToken can } else { - matchOptions = RegexOptions; - } - - RegexOptions regOptions = System.Text.RegularExpressions.RegexOptions.CultureInvariant; - - if (matchOptions.Contains("i")) - { - regOptions |= System.Text.RegularExpressions.RegexOptions.IgnoreCase; - } - - if (matchOptions.Contains("m")) - { - regOptions |= System.Text.RegularExpressions.RegexOptions.Multiline; - } - - if (matchOptions.Contains("^") && !regularExpression.StartsWith("^", StringComparison.Ordinal)) - { - regularExpression = "^" + regularExpression; - } - - if (matchOptions.Contains("$") && !regularExpression.EndsWith("$", StringComparison.Ordinal)) - { - regularExpression += "$"; + matchOptions = DefaultRegexOptions; } try { - return Task.FromResult(InvokeRegexFunction(inputString, regularExpression, regOptions)); + return Task.FromResult(InvokeRegexFunction(inputString, regularExpression, matchOptions)); } catch (RegexMatchTimeoutException rexTimeoutEx) { @@ -298,6 +213,206 @@ public Task InvokeAsync(FormulaValue[] args, CancellationToken can #pragma warning restore SA1119 // Statement should not use unnecessary parenthesis } + + protected (string, RegexOptions) AlterRegex_DotNet(string regex, string options) + { + var altered = new StringBuilder(); + bool openCharacterClass = false; // are we defining a character class? 
+ int index = 0; + + Match inlineOptions = Regex.Match(regex, @"\A\(\?([imnsx]+)\)"); + if (inlineOptions.Success) + { + options = options + inlineOptions.Groups[1]; + index = inlineOptions.Length; + } + + bool freeSpacing = options.Contains("x"); + bool multiline = options.Contains("m"); + bool ignoreCase = options.Contains("i"); + bool dotAll = options.Contains("s"); + bool matchStart = options.Contains("^"); + bool matchEnd = options.Contains("$"); + bool numberedSubMatches = options.Contains("N"); + + // Can't add options ^ and $ too early as there may be freespacing comments, centalize the logic here and call subfunctions + string AlterStart() + { + // ^ doesn't require any translation if not in multilline, only matches the start of the string + // MatchAll( "1a3" & Char(13) & "2b4", "(?m)^\d" ) would not match "2" without translation + return openCharacterClass ? "^" : (multiline ? @"(?<=\A|\r\n|\r|\n)" : "^"); + } + + string AlterEnd() + { + // $ does require translation if not in multilline, as $ does look past newlines to the end in .NET but it doesn't take into account \r + // MatchAll( "1a3" & Char(13) & "2b4" & Char(13), "(?m)\d$" ) would not match "3" or "4" without translation + // Match( "1a3" & Char(13), "\d$" ) would also not match "3" without translation + return openCharacterClass ? "$" : (multiline ? 
@"(?=\z|\r\n|\r|\n)" : @"(?=\z|\r\n\z|\r\z|\n\z)"); + } + + for (; index < regex.Length; index++) + { + switch (regex[index]) + { + case '[': + openCharacterClass = true; + altered.Append('['); + break; + + case ']': + openCharacterClass = false; + altered.Append(']'); + break; + + case '#': + if (freeSpacing && !openCharacterClass) + { + for (index++; index < regex.Length && regex[index] != '\r' && regex[index] != '\n'; index++) + { + // skip the comment characters until the next newline, in case it includes [ ] + } + + // need something to be emitted to avoid "\1#" & Char(10) & "1" being interpreted as "\11" + // need to replace a \r ending comment (supported by Power Fx) with a \n ending comment (supported by .NET) + // also need to make sure the comment terminates with a newline in case we add a "$" below + altered.Append("\n"); + } + else + { + altered.Append('#'); + } + + break; + + case '(': + // inline comment + if (regex.Length - index > 2 && regex[index + 1] == '?' && regex[index + 2] == '#') + { + for (index++; index < regex.Length && regex[index] != ')'; index++) + { + // skip the comment characters until the next closing paren, in case it includes [ ] + } + + // need something to be emitted to avoid "\1(?#)1" being interpreted as "\11" + altered.Append("(?#)"); + } + else + { + altered.Append(regex[index]); + } + + break; + + case '\\': + altered.Append("\\"); + if (++index < regex.Length) + { + altered.Append(regex[index]); + } + + break; + + case '.': + altered.Append(!openCharacterClass && !dotAll ? @"[^\r\n]" : "."); + break; + + case '^': + altered.Append(AlterStart()); + break; + + case '$': + altered.Append(AlterEnd()); + break; + + default: + altered.Append(regex[index]); + break; + } + } + + RegexOptions alteredOptions = RegexOptions.CultureInvariant | + (multiline ? RegexOptions.Multiline : 0) | + (ignoreCase ? RegexOptions.IgnoreCase : 0) | + (dotAll ? RegexOptions.Singleline : 0) | + (freeSpacing ? 
RegexOptions.IgnorePatternWhitespace : 0) | + (numberedSubMatches ? 0 : RegexOptions.ExplicitCapture); + + return ((matchStart ? AlterStart() : string.Empty) + altered.ToString() + (matchEnd ? AlterEnd() : string.Empty), alteredOptions); + } + + protected static RecordValue GetRecordFromMatch(Regex rex, Match m, RegexOptions options) + { + Dictionary fields = new () + { + { FULLMATCH, new NamedValue(FULLMATCH, StringValue.New(m.Value)) }, + { STARTMATCH, new NamedValue(STARTMATCH, NumberValue.New((double)m.Index + 1)) } + }; + + List subMatches = new List(); + string[] groupNames = rex.GetGroupNames(); + + for (int i = 0; i < groupNames.Length; i++) + { + string groupName = groupNames[i]; + string validName = DName.MakeValid(groupName, out _).Value; + Group g = m.Groups[i]; + FormulaValue val = g.Success ? StringValue.New(g.Value) : BlankValue.NewBlank(FormulaType.String); + + if (!int.TryParse(groupName, out _)) + { + if (!fields.ContainsKey(validName)) + { + fields.Add(validName, new NamedValue(validName, val)); + } + else + { + fields[validName] = new NamedValue(validName, val); + } + } + + if (i > 0) + { + subMatches.Add(FormulaValue.NewRecordFromFields(new NamedValue(TableValue.ValueName, val))); + } + } + + if (!fields.ContainsKey(SUBMATCHES) && (options & RegexOptions.ExplicitCapture) == 0) + { + var recordType = RecordType.Empty().Add(TableValue.ValueName, FormulaType.String); + fields.Add(SUBMATCHES, new NamedValue(SUBMATCHES, TableValue.NewTable(recordType, subMatches))); + } + + return RecordValue.NewRecordFromFields(fields.Values); + } + + protected static DType GetRecordTypeFromRegularExpression(string regularExpression, RegexOptions regularExpressionOptions) + { + Dictionary propertyNames = new (); + Regex rex = new Regex(regularExpression, regularExpressionOptions); + + propertyNames.Add(FULLMATCH, new TypedName(DType.String, new DName(FULLMATCH))); + propertyNames.Add(STARTMATCH, new TypedName(DType.Number, new DName(STARTMATCH))); + if 
((regularExpressionOptions & RegexOptions.ExplicitCapture) == 0) + { + propertyNames.Add(SUBMATCHES, new TypedName(DType.CreateTable(new TypedName(DType.String, new DName(TexlFunction.ColumnName_ValueStr))), new DName(SUBMATCHES))); + } + + foreach (string groupName in rex.GetGroupNames()) + { + if (!int.TryParse(groupName, out _)) + { + DName validName = DName.MakeValid(groupName, out _); + + if (!propertyNames.ContainsKey(validName.Value)) + { + propertyNames.Add(validName.Value, new TypedName(DType.String, validName)); + } + } + } + + return DType.CreateRecord(propertyNames.Values); + } } } } diff --git a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryTable.cs b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryTable.cs index e3896aa133..1b37eb4f1b 100644 --- a/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryTable.cs +++ b/src/libraries/Microsoft.PowerFx.Interpreter/Functions/LibraryTable.cs @@ -882,31 +882,31 @@ public static async ValueTask SortTable(EvalVisitor runner, EvalVi if (allNumbers) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allDecimals) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allStrings) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allBooleans) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allDatetimes) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allDates) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, 
irContext, compareToResultModifier); } else if (allTimes) { - return SortValueType(pairs, irContext, compareToResultModifier); + return SortValueType(pairs, runner, irContext, compareToResultModifier); } else if (allOptionSets) { @@ -1333,7 +1333,7 @@ private static FormulaValue DistinctValueType(List<(DValue row, For return new InMemoryTableValue(irContext, result); } - private static FormulaValue SortValueType(List<(DValue row, FormulaValue sortValue)> pairs, IRContext irContext, int compareToResultModifier) + private static FormulaValue SortValueType(List<(DValue row, FormulaValue sortValue)> pairs, EvalVisitor runner, IRContext irContext, int compareToResultModifier) where TPFxPrimitive : PrimitiveValue where TDotNetPrimitive : IComparable { @@ -1349,8 +1349,16 @@ private static FormulaValue SortValueType(List< } var n1 = a.sortValue as TPFxPrimitive; - var n2 = b.sortValue as TPFxPrimitive; - return n1.Value.CompareTo(n2.Value) * compareToResultModifier; + var n2 = b.sortValue as TPFxPrimitive; + CultureInfo culture; + if (n1.Value is string n1s && n2.Value is string n2s && (culture = runner.GetService()) != null) + { + return culture.CompareInfo.Compare(n1s, n2s) * compareToResultModifier; + } + else + { + return n1.Value.CompareTo(n2.Value) * compareToResultModifier; + } }); return new InMemoryTableValue(irContext, pairs.Select(pair => pair.row)); diff --git a/src/libraries/Microsoft.PowerFx.Repl/Services/MultilineProcessor.cs b/src/libraries/Microsoft.PowerFx.Repl/Services/MultilineProcessor.cs index 0567995f03..d0a80982e3 100644 --- a/src/libraries/Microsoft.PowerFx.Repl/Services/MultilineProcessor.cs +++ b/src/libraries/Microsoft.PowerFx.Repl/Services/MultilineProcessor.cs @@ -7,6 +7,10 @@ using System.Linq.Expressions; using System.Text; using System.Text.RegularExpressions; +using Microsoft.CodeAnalysis; +using Microsoft.CodeAnalysis.CodeActions; +using Microsoft.CodeAnalysis.CSharp.Syntax; +using Microsoft.CodeAnalysis.FlowAnalysis; using 
Microsoft.PowerFx.Core.Localization; using Microsoft.PowerFx.Core.Utils; using Microsoft.PowerFx.Repl.Functions; diff --git a/src/strings/PowerFxResources.en-US.resx b/src/strings/PowerFxResources.en-US.resx index a01b3fcbdf..a5f92f2c61 100644 --- a/src/strings/PowerFxResources.en-US.resx +++ b/src/strings/PowerFxResources.en-US.resx @@ -4515,8 +4515,140 @@ Invalid regular expression. Error message indicating that the regular expression entered by the user is invalid. + + Invalid regular expression: Inline options must appear at the beginning of the regular expression, found "{0}" later. + Error message indicating that the regular expression has inline options not at the front. + + + Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "{0}". + Error message indicating that the regular expression has bad inline options. + + + Invalid regular expression: Octal \0 character codes are not supported, use hexadecimal \x or Unicode \u instead, found "{0}". + Error message indicating that the regular expression has octal characters. + + + Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{0}". + Error message indicating that the regular expression has literal curly braces. + + + Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "{0}". + Error message indicating that the regular expression has literal square braces. + + + Invalid regular expression: Unsupported special group, found "{0}". + Error message indicating that the regular expression that starts with a paren. + + + Invalid regular expression: Escape character not permitted within character class, found "{0}". + Error message indicating that the regular expression has an escape character within a character class. 
+ + + Invalid regular expression: Negative escape character not permitted within negated character class, found "{0}". + Error message indicating that the regular expression has an escape character within a negated character class. + + + Invalid regular expression: Escape character not permitted outside a character class, found "{0}". + Error message indicating that the regular expression has an escape character outside a character class. + + + Invalid regular expression: Invalid Unicode category name, found "{0}". + Error message indicating that the regular expression has an invalid Unicode category name. + + + Invalid regular expression: Repeated inline option, found "{0}". + Error message indicating that the regular expression includes more than one of the same option. + + + Invalid regular expression: Unclosed inline comment, starts with "(?#...". + Error message indicating that the regular expression includes an unclosed inline comment. + + + Invalid regular expression: Inline option is incompatible with MatchOptions.NumberedSubMatches, found "{0}". + Error message indicating that the regular expression includes an inline option that is incompatible with numbered sub matches. + + + Invalid regular expression: Possessive quantifiers are not supported, found "{0}". + Error message indicating that the regular expression does not support possessive quantifiers. + + + Invalid regular expression: Exact quantifiers cannot be used with quantifier modifiers such as ? for lazy, found "{0}". + Error message indicating that the regular expression does not support modifiers for exact quantifiers. + + + Invalid regular expression: Invalid escape code, found "{0}". + Error message indicating that the regular expression has an invalid alphanumeric escape code. + + + Invalid regular expression: Capture group "{0}" not defined. + Error message indicating that the regular expression capture group is not defined. 
+ + + Invalid regular expression: Self-referencing capture groups are not supported, found "{0}". + Error message indicating that the regular expression has self-referencing capture groups. + + + Invalid regular expression: Unclosed groups, too few closing parenthesis. + Error message indicating that the regular expression has unclosed capture groups. + + + Invalid regular expression: Unopened groups, too few opening parenthesis. + Error message indicating that the regular expression has unopened capture groups. + + + Invalid regular expression: Named capture group "{0}" defined more than once. + Error message indicating that the regular expression is trying to define the same capture group more than once. + + + Invalid regular expression: Named capture group name must be a combination of letters and digits and begin with a letter, found "{0}". + Error message indicating that the regular expression has a named capture group with an invalid name. + + + Invalid regular expression: Inline options must appear at the beginning of the regular expression, found "{0}" later. + Error message indicating that the regular expression has inline options not at the front. + + + Invalid regular expression: Character appears more than once in character class, found repeated "{0}". + Error message indicating that the regular expression has repeated characters in a character class definition. + + + Invalid regular expression: Square bracket character classes cannot be empty, found "{0}". + Error message indicating that the regular expression character class is empty. + + + Invalid regular expression: Balancing capture groups are not supported, found "{0}". + Error message indicating that the regular expression has balancing capture groups. + + + Invalid regular expression: Using single quoted named captures is not supported, use (?<...>) syntax instead, found "{0}". + Error message indicating that the regular expression has single quoted named capture. 
+ + + Invalid regular expression: Conditional alternation is not supported, found "{0}". + Error message indicating that the regular expression has conditionals. + + + Invalid regular expression: Named captures cannot be used with MatchOptions.NumberedSubMatches enabled, found "{0}". + Error message indicating that the regular expression is mixing named and numbered captures. + + + Invalid regular expression: Use named captures with "(?<name>...)" and "\k<name>" or enable MatchOptions.NumberedSubMatches, found "{0}". + Error message indicating that the regular expression is not enabled for numbered captures. + + + Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "{0}". + Error message indicating that the regular expression has a hyphen in a character class. + + + Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "{0}". + Error message indicating that the regular expression has an unescaped character within a character class. + - Regular expressions must be constant values. + Regular expression must be a constant value. + Error Message. + + + MatchOptions must be a constant value. Error Message. 
diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_en-US.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_en-US.txt new file mode 100644 index 0000000000..5934827f7e --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_en-US.txt @@ -0,0 +1,163 @@ +#SETUP: RegEx,CultureInfo("en-US"),PowerFxV1CompatibilityRules,ConsistentOneColumnTableResult,SupportColumnNamesAsIdentifiers + +// Four types of letter I +// Dotted Dotless +// Upper İ U+0130 I U+0049 +// Lower i U+0069 ı U+0131 + +>> Language() +"en-US" + +>> "İ" = UniChar( Hex2Dec( "0130") ) +true + +>> "ı" = UniChar( Hex2Dec( "0131" ) ) +true + +// UPPER, LOWER, PROPER + +>> Upper( "i" ) +"I" + +>> Lower( "I" ) +"i" + +>> Upper( "i" ) = "I" +true + +>> Lower( "I" ) = "i" +true + +>> Lower( "quit" ) = Lower( "QUIT" ) +true + +>> Lower( "quit" ) = Lower( "QUİT" ) +true + +>> Lower( "quıt" ) = Lower( "QUIT" ) +false + +>> Upper( "quit" ) = Upper( "QUIT" ) +true + +>> Proper( "Iabc" ) +"Iabc" + +>> Proper( "iabc" ) +"Iabc" + +// VALUE, DECIMAL, FLOAT + +>> Value( "123,456" ) +123456 + +>> Value( "123,456", "tr-TR" ) +123.456 + +>> Decimal( "123,456" ) +123456 + +>> Decimal( "123,456", "tr-TR" ) +123.456 + +>> Float( "123,456" ) +123456 + +>> Float( "123,456", "tr-TR" ) +123.456 + +// TEXT + +>> Text( DateTime(2010,1,1,14,0,0,0), "mmm ddd yyyy AM/PM" ) +"Jan Fri 2010 PM" + +>> Text( DateTime(2020,1,1,2,0,0,0), "mmmm dddd yyyy AM/PM" ) +"January Wednesday 2020 AM" + +>> Text( 123456789, "#,###.00" ) +"123,456,789.00" + +>> Text( 123456789, "#.###,00" ) +"123456789.00000" + +// IN AND EXACTIN + +>> "i" in "SIGH" +true + +>> "I" in "sigh" +true + +>> "i" exactin "SIGH" +false + +>> "I" exactin "sigh" +false + +>> "I" exactin "SIGH" +true + +>> "i" exactin "sigh" +true + +>> "sIGh" in ["sigh","bcde"] +true + +>> "siGh" in ["SIGH","bcde"] +true + +>> "sIGH" in ["sigh","bcde"] +true + +>> "siGH" in ["bcde","sIgh"] +true + 
+>> "SIgh" in ["bcde","sigh"] +true + +// REGULAR EXPRESSIONS +// Always uses invariant even though tr-TR is set, subject of https://github.com/microsoft/Power-Fx/issues/2538 + +// Results when using C# // Invariant tr-TR en-US + +>> IsMatch( "İ", "i", MatchOptions.IgnoreCase ) // false TRUE TRUE +false + +>> IsMatch( "i", "İ", MatchOptions.IgnoreCase ) // false TRUE TRUE +false + +>> IsMatch( "ı", "I", MatchOptions.IgnoreCase ) // false TRUE false +false + +>> IsMatch( "I", "ı", MatchOptions.IgnoreCase ) // false TRUE false +false + +>> IsMatch( "İ", "I", MatchOptions.IgnoreCase ) // false false TRUE +false + +>> IsMatch( "I", "İ", MatchOptions.IgnoreCase ) // false false TRUE +false + +>> IsMatch( "ı", "i", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "i", "ı", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "i", "I", MatchOptions.IgnoreCase ) // TRUE false TRUE +true + +>> IsMatch( "I", "i", MatchOptions.IgnoreCase ) // TRUE false TRUE +true + +>> IsMatch( "ı", "İ", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "İ", "ı", MatchOptions.IgnoreCase ) // false false false +false + +>> ShowColumns( Match( "hiIıİİıIhi", "\u0130+" ), FullMatch, StartMatch ) +{FullMatch:"İİ",StartMatch:5} + +>> IsMatch( "Sıgh", "\u0131", MatchOptions.Contains ) +true diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_tr-TR.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_tr-TR.txt new file mode 100644 index 0000000000..d3aed5eec7 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Culture_tr-TR.txt @@ -0,0 +1,193 @@ +#SETUP: RegEx,CultureInfo("tr-TR"),PowerFxV1CompatibilityRules,ConsistentOneColumnTableResult,SupportColumnNamesAsIdentifiers + +// Four types of letter I +// Dotted Dotless +// Upper İ U+0130 I U+0049 +// Lower i U+0069 ı U+0131 + +>> Language() +"tr-TR" + +>> "İ" = UniChar( Hex2Dec( "0130") ) +true + +>> 
"ı" = UniChar( Hex2Dec( "0131" ) ) +true + +// UPPER, LOWER, PROPER + +>> Upper( "i" ) +"İ" + +>> Lower( "I" ) +"ı" + +>> Upper( "ı" ) +"I" + +>> Lower( "İ" ) +"i" + +>> Upper( "i" ) = UniChar( Hex2Dec( "0130") ) +true + +>> Lower( "I" ) = UniChar( Hex2Dec( "0131") ) +true + +>> Upper( "i" ) = "I" +false + +>> Lower( "I" ) = "i" +false + +>> Lower( "quit" ) = Lower( "QUIT" ) +false + +>> Lower( "quit" ) = Lower( "QUİT" ) +true + +>> Lower( "quıt" ) = Lower( "QUIT" ) +true + +>> Upper( "quit" ) = Upper( "QUIT" ) +false + +>> Upper( "quit" ) = Upper( "QUİT" ) +true + +>> Upper( "quıt" ) = Upper( "QUIT" ) +true + +>> Proper( "Iabc" ) +"Iabc" + +>> Proper( "iabc" ) +"İabc" + +>> Proper( "İabc" ) +"İabc" + +>> Proper( "ıabc" ) +"Iabc" + +// VALUE, DECIMAL, FLOAT + +>> Value( "123,456" ) +123.456 + +>> Value( "123,456", "en-US" ) +123456 + +>> Decimal( "123,456" ) +123.456 + +>> Decimal( "123,456", "en-US" ) +123456 + +>> Float( "123,456" ) +123.456 + +>> Float( "123,456", "en-US" ) +123456 + +// TEXT + +>> Text( DateTime(2010,1,1,14,0,0,0), "mmm ddd yyyy AM/PM" ) +"Oca Cum 2010 ÖS" + +>> Text( DateTime(2020,1,1,2,0,0,0), "mmmm dddd yyyy AM/PM" ) +"Ocak Çarşamba 2020 ÖÖ" + +>> Text( 123456789, "#,###.00" ) +"123456789,00000" + +>> Text( 123456789, "#.###,00" ) +"123.456.789,00" + +// IN AND EXACTIN + +>> "ı" in "SIGH" +true + +>> "İ" in "sigh" +true + +>> "ı" in "SİGH" +false + +>> "İ" in "sıgh" +false + +>> "ı" exactin "SIGH" +false + +>> "İ" exactin "sigh" +false + +>> "ı" exactin "SİGH" +false + +>> "İ" exactin "sıgh" +false + +>> "sİGh" in ["sigh","bcde"] +true + +>> "siGh" in ["SİGH","bcde"] +true + +>> "sIGH" in ["sigh","bcde"] +false + +>> "sıGH" in ["bcde","sIgh"] +true + +>> "SIgh" in ["bcde","sıgh"] +true + +// REGULAR EXPRESSIONS +// Always uses invariant even though tr-TR is set, subject of https://github.com/microsoft/Power-Fx/issues/2538 + +// Results when using C# // Invariant tr-TR en-US + +>> IsMatch( "İ", "i", MatchOptions.IgnoreCase ) // false TRUE 
TRUE +false + +>> IsMatch( "i", "İ", MatchOptions.IgnoreCase ) // false TRUE TRUE +false + +>> IsMatch( "ı", "I", MatchOptions.IgnoreCase ) // false TRUE false +false + +>> IsMatch( "I", "ı", MatchOptions.IgnoreCase ) // false TRUE false +false + +>> IsMatch( "İ", "I", MatchOptions.IgnoreCase ) // false false TRUE +false + +>> IsMatch( "I", "İ", MatchOptions.IgnoreCase ) // false false TRUE +false + +>> IsMatch( "ı", "i", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "i", "ı", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "i", "I", MatchOptions.IgnoreCase ) // TRUE false TRUE +true + +>> IsMatch( "I", "i", MatchOptions.IgnoreCase ) // TRUE false TRUE +true + +>> IsMatch( "ı", "İ", MatchOptions.IgnoreCase ) // false false false +false + +>> IsMatch( "İ", "ı", MatchOptions.IgnoreCase ) // false false false +false + +>> ShowColumns( Match( "hiIıİİıIhi", "\u0130+" ), FullMatch, StartMatch ) +{FullMatch:"İİ",StartMatch:5} + +>> IsMatch( "Sıgh", "\u0131", MatchOptions.Contains ) +true diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch.txt index b35d34775f..2e99ea9b71 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch.txt @@ -49,7 +49,7 @@ Error({Kind:ErrorKind.Div0}) false >> IsMatch("Foo", Blank()) -Errors: Error 15-22: Regular expressions must be constant values. +Errors: Error 15-22: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. >> IsMatch(28, "Bar") false @@ -61,7 +61,7 @@ true false >> IsMatch(Blank(), Blank()) -Errors: Error 17-24: Regular expressions must be constant values. +Errors: Error 17-24: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. 
>> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` A 1234567890", "\p{L}") false @@ -73,37 +73,51 @@ false >> IsMatch("!@#$%^&*()-=_+<>,.:;\'{}[]\|?/~` Ð 1234567890", "\p{L}") false ->> IsMatch("Foo", "J(") -Error({Kind:ErrorKind.BadRegex}) - -// Dangerous Regex, will timeout (should take >2h on a fast CPU) ->> IsMatch("ababababababababababababababababababababababababababababababababababa", "^((ab)*)+$") -Error({Kind:ErrorKind.Timeout}) - >> IsMatch( "28", Concat( [2,8], Value ) ) -Errors: Error 15-37: Regular expressions must be constant values. +Errors: Error 15-37: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. >> IsMatch( "28", Concat( [2,8], If( false, Text(Now()), Value ) ) ) -Errors: Error 15-63: Regular expressions must be constant values. +Errors: Error 15-63: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. ->> IsMatch("(555) 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +>> IsMatch("(555) 123-4567", "^[\+]?[\(]?[0-9]{3}[\)]?[\-\s\.]?[0-9]{3}[\-\s\.]?[0-9]{4,6}$") true ->> IsMatch("(555)_123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +>> IsMatch("(555)_123-4567", "^[\+]?[\(]?[0-9]{3}[\)]?[\-\s\.]?[0-9]{3}[\-\s\.]?[0-9]{4,6}$") false ->> IsMatch("147 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +>> IsMatch("147 123-4567", "^[\+]?[\(]?[0-9]{3}[\)]?[\-\s\.]?[0-9]{3}[\-\s\.]?[0-9]{4,6}$") true ->> IsMatch("(44) 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +>> IsMatch("(44) 123-4567", "^[\+]?[\(]?[0-9]{3}[\)]?[\-\s\.]?[0-9]{3}[\-\s\.]?[0-9]{4,6}$") false ->> IsMatch("""Hello world""", Char(34) & "Hello", MatchOptions.Contains) +>> IsMatch("""Hello world""", Char(34) & "Hello", MatchOptions.Contains) true ->> IsMatch("Hello 123 world", $"Hello {Sqrt(1)}{Sqrt(4)}{Sqrt(9)} world") +>> IsMatch("""Hello world""", UniChar(34) & "Hello", 
MatchOptions.Contains) +true + +>> IsMatch("👽Hello world", UniChar(Hex2Dec("1F47D")) & "Hello", MatchOptions.Contains) +Errors: Error 51-52: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch("👽Hello world", UniChar(128125) & "Hello", MatchOptions.Contains) +true + +>> IsMatch("👽Hello world", "\ud83d\udc7dHello", MatchOptions.Contains) // surrogate pairs +true + +>> IsMatch(UniChar(Hex2Dec("1f47d")) & "Hello world", UniChar(128125) & "Hello", MatchOptions.Contains) true +>> IsMatch("""Hello world""", Mid( "Hello", 1 ), MatchOptions.Contains) +Errors: Error 27-44: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch("Hello 123 world", $"Hello {Sqrt(1)}{Sqrt(4)}{Sqrt(9)} world") +Errors: Error 27-69: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch("Hello 123 world", $"Hello") +Errors: Error 27-35: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. + >> IsMatch("Hello", "Hello", MatchOptions.IgnoreCase) true diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_StronglyTypedEnumsDisabled.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_StronglyTypedEnumsDisabled.txt index b590518ad1..eaf657eb7f 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_StronglyTypedEnumsDisabled.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_StronglyTypedEnumsDisabled.txt @@ -47,13 +47,13 @@ false true >> IsMatch("Foo", 17) -Errors: Error 15-17: Regular expressions must be constant values. +Errors: Error 15-17: Regular expression must be a constant value. >> IsMatch("Foo", 1/0) -Errors: Error 16-17: Regular expressions must be constant values. 
+Errors: Error 16-17: Regular expression must be a constant value. >> IsMatch("28", 28) -Errors: Error 14-16: Regular expressions must be constant values. +Errors: Error 14-16: Regular expression must be a constant value. >> IsMatch("Hello", "Hello", "") true diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1Compat.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1Compat.txt new file mode 100644 index 0000000000..905044158b --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1Compat.txt @@ -0,0 +1,7 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules + +>> IsMatch("Foo", "J(") +Errors: Error 15-19: Invalid regular expression: Unclosed groups, too few closing parenthesis.|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch("""Hello world""", "\w+", If( Sqrt(4) > 0, MatchOptions.Contains, MatchOptions.Complete)) +Errors: Error 34-96: MatchOptions must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. 
diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1CompatDisabled.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1CompatDisabled.txt new file mode 100644 index 0000000000..f4c31860e3 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/IsMatch_V1CompatDisabled.txt @@ -0,0 +1,4 @@ +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +>> IsMatch("Foo", "J(") +Error({Kind:ErrorKind.BadRegex}) diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match.txt index f88891561c..4a11555b54 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match.txt @@ -1,14 +1,11 @@ #SETUP: RegEx ->> Match("Hello", "\w").FullMatch -"H" - ->> Match("Hello", "\w") -{FullMatch:"H",StartMatch:1,SubMatches:Table()} - >> Match("Hello", "(?\w)") {FullMatch:"H",StartMatch:1,SubMatches:"H"} +>> Match("Hello", "\w").FullMatch +"H" + >> Match("Hello", "\w").StartMatch 1 @@ -24,9 +21,6 @@ Blank() >> Match("Hello", "llo", MatchOptions.Complete).StartMatch Blank() ->> Match("Hello", "llo", MatchOptions.Complete).SubMatches -Blank() - >> Match("Bob Jones ", "<(?" & Match.Email & ")>").email "bob.jones@contoso.com" @@ -42,9 +36,6 @@ Blank() >> Match( "Bob Jones ", "<(?" & Match.Email & ")>").StartMatch 11 ->> Concat(ForAll(Match( "Bob Jones ", "<(?" 
& Match.Email & ")>").SubMatches, With({x:Value}, x)), Value, ", ") -"bob.jones@contoso.com" - >> Match("Hello", "(?\w)l(?\w)").FullMatch "ell" @@ -57,51 +48,23 @@ Blank() >> Match("Hello", "(?\w)l(?\w)").p2 "l" ->> Index(Match("Hello", "(?\w)l(?\w)").SubMatches, 1).Value -"e" - ->> Index(Match("Hello", "(?\w)l(?\w)").SubMatches, 2).Value -"l" - ->> Concat(ForAll(Match("Hello", "(?\w)l(?\w)").SubMatches, With({x:Value}, x)), Value, ", ") -"e, l" - >> With(Match("PT2H1M39S", "PT(?:(?\d+)H)?(?:(?\d+)M)?(?:(?\d+)S)?"), Time(Value(hours), Value(minutes), Value(seconds))) Time(2,1,39,0) ->> Match("Hello", "(?\w)l(?\w)").SubMatches -Table({Value:"e"},{Value:"l"}) - >> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete) Blank() ->> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) -{FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})} - ->> Match("JohnDoe@microsoft.com", Match.Email) -{FullMatch:"JohnDoe@microsoft.com",StartMatch:1,SubMatches:Table()} - >> Match(Blank(), ".") Blank() >> Match(Blank(), Blank()) -Errors: Error 15-22: Regular expressions must be constant values.|Error 0-5: The function 'Match' has some invalid arguments. +Errors: Error 15-22: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. >> Match("28", 28) -Errors: Error 12-14: Regular expressions must be constant values.|Error 0-5: The function 'Match' has some invalid arguments. +Errors: Error 12-14: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. >> Match(1/0, "Hi") Error({Kind:ErrorKind.Div0}) >> Match("Hello", Right("llo", 3)).FullMatch -Errors: Error 15-30: Regular expressions must be constant values.|Error 0-5: The function 'Match' has some invalid arguments.|Error 31-41: Name isn't valid. 'FullMatch' isn't recognized. 
- ->> Match("(555) 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") -{FullMatch:"(555) 123-4567",StartMatch:1,SubMatches:Table()} - ->> Match("Hello", "Hello", MatchOptions.IgnoreCase) -{FullMatch:"Hello",StartMatch:1,SubMatches:Table()} - ->> Match("Hi", "Hi", MatchOptions.Multiline) -{FullMatch:"Hi",StartMatch:1,SubMatches:Table()} - +Errors: Error 15-30: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments.|Error 31-41: Name isn't valid. 'FullMatch' isn't recognized. diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll.txt index 915d86714a..8d1a0822eb 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll.txt @@ -1,34 +1,13 @@ #SETUP: RegEx ->> MatchAll("Hello", "\w") -Table({FullMatch:"H",StartMatch:1,SubMatches:Table()},{FullMatch:"e",StartMatch:2,SubMatches:Table()},{FullMatch:"l",StartMatch:3,SubMatches:Table()},{FullMatch:"l",StartMatch:4,SubMatches:Table()},{FullMatch:"o",StartMatch:5,SubMatches:Table()}) - >> MatchAll("Hello", "llo", MatchOptions.Complete) Table() ->> MatchAll("Bob Jones ", "<(?" 
& Match.Email & ")>") -Table({FullMatch:"",StartMatch:11,SubMatches:Table({Value:"bob.jones@contoso.com"}),email:"bob.jones@contoso.com"}) - ->> MatchAll("PT2H1M39S", "PT(?:(?\d+)H)?(?:(?\d+)M)?(?:(?\d+)S)?") -Table({FullMatch:"PT2H1M39S",StartMatch:1,SubMatches:Table({Value:"2"},{Value:"1"},{Value:"39"}),hours:"2",minutes:"1",seconds:"39"}) - ->> MatchAll("Hello", "(?\w)l(?\w)") -Table({FullMatch:"ell",StartMatch:2,SubMatches:Table({Value:"e"},{Value:"l"}),p1:"e",p2:"l"}) - ->> MatchAll("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) -Table({FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})},{FullMatch:"Sam 208",StartMatch:9,SubMatches:Table({Value:"Sam"},{Value:"208"})}) - >> MatchAll(Blank(), ".") Table() >> MatchAll(Blank(), Blank()) -Errors: Error 18-25: Regular expressions must be constant values.|Error 0-8: The function 'MatchAll' has some invalid arguments. +Errors: Error 18-25: Regular expression must be a constant value.|Error 0-8: The function 'MatchAll' has some invalid arguments. 
>> MatchAll(1/0, "Hi") Error({Kind:ErrorKind.Div0}) - ->> MatchAll("Hello", "Hello", MatchOptions.IgnoreCase) -Table({FullMatch:"Hello",StartMatch:1,SubMatches:Table()}) - ->> MatchAll("Hi", "Hi", MatchOptions.Multiline) -Table({FullMatch:"Hi",StartMatch:1,SubMatches:Table()}) diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_StronglyTypedEnumsDisabled.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_StronglyTypedEnumsDisabled.txt index 5cad17fbe4..4269439fea 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_StronglyTypedEnumsDisabled.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_StronglyTypedEnumsDisabled.txt @@ -1,4 +1,4 @@ #SETUP: RegEx,disable:StronglyTypedBuiltinEnums ->> MatchAll("Hello", "Hello", "") -Table({FullMatch:"Hello",StartMatch:1,SubMatches:Table()}) +>> ForAll( MatchAll("Helloofammasdfooerf", "(?\w)\k", ""), {fm:FullMatch} ) +Table({fm:"ll"},{fm:"oo"},{fm:"mm"},{fm:"oo"}) diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1Compat.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1Compat.txt new file mode 100644 index 0000000000..7d4e3aa8bd --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1Compat.txt @@ -0,0 +1,44 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules + +>> MatchAll("Hello", "\w") +Table({FullMatch:"H",StartMatch:1},{FullMatch:"e",StartMatch:2},{FullMatch:"l",StartMatch:3},{FullMatch:"l",StartMatch:4},{FullMatch:"o",StartMatch:5}) + +>> MatchAll("Hello", "llo", MatchOptions.Complete) +Table() + +>> MatchAll("Bob Jones ", "<(?" 
& Match.Email & ")>") +Table({FullMatch:"",StartMatch:11,email:"bob.jones@contoso.com"}) + +>> MatchAll("PT2H1M39S", "PT(?:(?\d+)H)?(?:(?\d+)M)?(?:(?\d+)S)?") +Table({FullMatch:"PT2H1M39S",StartMatch:1,hours:"2",minutes:"1",seconds:"39"}) + +>> MatchAll("Hello", "(?\w)l(?\w)") +Table({FullMatch:"ell",StartMatch:2,p1:"e",p2:"l"}) + +>> MatchAll("Hello", "(\w)l(\w)") +Table({FullMatch:"ell",StartMatch:2}) + +>> MatchAll("Hello", "(\w)l(\w)", MatchOptions.NumberedSubMatches) +Table({FullMatch:"ell",StartMatch:2,SubMatches:Table({Value:"e"},{Value:"l"})}) + +>> MatchAll("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline & MatchOptions.NumberedSubMatches) +Table({FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})},{FullMatch:"Sam 208",StartMatch:9,SubMatches:Table({Value:"Sam"},{Value:"208"})}) + +>> MatchAll(Blank(), ".") +Table() + +>> MatchAll(Blank(), Blank()) +Errors: Error 18-25: Regular expression must be a constant value.|Error 0-8: The function 'MatchAll' has some invalid arguments. + +>> MatchAll(1/0, "Hi") +Error({Kind:ErrorKind.Div0}) + +>> MatchAll("Hello", "Hello", MatchOptions.IgnoreCase) +Table({FullMatch:"Hello",StartMatch:1}) + +>> MatchAll("Hi", "Hi", MatchOptions.Multiline) +Table({FullMatch:"Hi",StartMatch:1}) + +>> MatchAll("28", "28", Blank()) +Errors: Error 21-28: MatchOptions must be a constant value.|Error 0-8: The function 'MatchAll' has some invalid arguments. 
+ diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1CompatDisabled.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1CompatDisabled.txt new file mode 100644 index 0000000000..aa08769eb0 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/MatchAll_V1CompatDisabled.txt @@ -0,0 +1,22 @@ +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +>> MatchAll("Hello", "\w") +Table({FullMatch:"H",StartMatch:1,SubMatches:Table()},{FullMatch:"e",StartMatch:2,SubMatches:Table()},{FullMatch:"l",StartMatch:3,SubMatches:Table()},{FullMatch:"l",StartMatch:4,SubMatches:Table()},{FullMatch:"o",StartMatch:5,SubMatches:Table()}) + +>> MatchAll("Bob Jones ", "<(?" & Match.Email & ")>") +Table({FullMatch:"",StartMatch:11,SubMatches:Table({Value:"bob.jones@contoso.com"}),email:"bob.jones@contoso.com"}) + +>> MatchAll("PT2H1M39S", "PT(?:(?\d+)H)?(?:(?\d+)M)?(?:(?\d+)S)?") +Table({FullMatch:"PT2H1M39S",StartMatch:1,SubMatches:Table({Value:"2"},{Value:"1"},{Value:"39"}),hours:"2",minutes:"1",seconds:"39"}) + +>> MatchAll("Hello", "(?\w)l(?\w)") +Table({FullMatch:"ell",StartMatch:2,SubMatches:Table({Value:"e"},{Value:"l"}),p1:"e",p2:"l"}) + +>> MatchAll("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) +Table({FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})},{FullMatch:"Sam 208",StartMatch:9,SubMatches:Table({Value:"Sam"},{Value:"208"})}) + +>> MatchAll("Hello", "Hello", MatchOptions.IgnoreCase) +Table({FullMatch:"Hello",StartMatch:1,SubMatches:Table()}) + +>> MatchAll("Hi", "Hi", MatchOptions.Multiline) +Table({FullMatch:"Hi",StartMatch:1,SubMatches:Table()}) diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Comments.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Comments.txt new file mode 100644 index 0000000000..da2f77e754 --- 
/dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Comments.txt @@ -0,0 +1,304 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules,SupportColumnNamesAsIdentifiers + +// Comments and free spacing behavior in Power Fx regular expressions. +// +// Effective Usage .NET ECMAScript PCRE2 +// ===================================================================================================================================== +// (?# ...) Inline comment Yes No Yes +// "x" option space insignificant and # comments Yes No Yes + +// INLINE COMMENTS + +>> Match( "test", "(?# this is a test)st" ) +{FullMatch:"st",StartMatch:3} + +>> Match( "test", "(?# this is a test with a " & Char(10) & " newline)st" ) +{FullMatch:"st",StartMatch:3} + +>> Match( "test", "(?# this is a test ( with an open paren )st" ) +{FullMatch:"st",StartMatch:3} + +>> Match( "aaaaa", "(?# happu ( )a" ) +{FullMatch:"a",StartMatch:1} + +>> Match( "aaaaa", "(?# happu () )a" ) +Errors: Error 16-33: Invalid regular expression: Unopened groups, too few opening parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "(?# this is a test \) with an escaped close paren )st" ) // can't escape a paren in an inline comment +Errors: Error 15-70: Invalid regular expression: Unopened groups, too few opening parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "te (?# this is a test with an unclosed comment" ) +Errors: Error 15-63: Invalid regular expression: Unclosed inline comment, starts with "(?#...".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +// with free spacing + +>> Match( "test", "(?# this is # a test)st", MatchOptions.FreeSpacing ) // # isn't seen because it is before the closing ) of the inline comment +{FullMatch:"st",StartMatch:3} + +>> Match( "test", " # (?# this is a test)st", MatchOptions.FreeSpacing ) // # inline comment entirely within # comment +{FullMatch:"",StartMatch:1} + +>> Match( "test", " # (?# this is " & Char(10) & " t es t", MatchOptions.FreeSpacing ) // # unclosed inline comment within # comment +{FullMatch:"test",StartMatch:1} + +>> Match( "test", "(?x) # (?# this is a test ( with an open paren )" & Char(10) & "st" ) +{FullMatch:"st",StartMatch:3} + +>> Match( "test", "(?x) # (?# this is a test \) with an escaped close paren ) " & Char(13) & "st" ) // can't escape a paren in an inline comment +{FullMatch:"st",StartMatch:3} + +// FREE SPACING NEWLINES SUPPORTED + +>> Match( "atestz", "(?x) + # this is free spacing! + + t + + e # e is for elephant + + s + + t # t is for termite + + ").FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)& "# this is free spacing!" &Char(13)&Char(13)& "t" &Char(13)&Char(13) & "e # e is for elephant" &Char(13)&Char(13)& "s" &Char(13)&Char(13)& "t # t is for terminte" &Char(13)&Char(13)).FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)& "# this is free spacing!" &Char(10)&Char(10)& "t" &Char(10)&Char(10) & "e # e is for elephant" &Char(10)&Char(10)& "s" &Char(13)&Char(13)& "t # t is for terminte" &Char(10)&Char(10)).FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)&Char(10)& "# this is free spacing!" &Char(13)&Char(10)&Char(13)&Char(10)& "t" &Char(13)&Char(10)&Char(13)&Char(10)& "e # e is for elephant" &Char(13)&Char(10)&Char(13)&Char(10)& "s" &Char(13)&Char(10)&Char(13)&Char(10)& "t # t is for terminte" &Char(13)&Char(10)&Char(13)&Char(10)).FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)& "# this is free spacing!" 
&Char(13)& "t" &Char(13)& "e # e is for elephant" &Char(13)& "s" &Char(13)& "t # t is for terminte").FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)& "# this is free spacing!" &Char(10)& "t" &Char(10)& "e # e is for elephant" &Char(10)& "s" &Char(10)& "t # t is for terminte" &Char(10)).FullMatch +"test" + +>> Match( "atestz", "(?x)" &Char(13)&Char(10)& "# this is free spacing!" &Char(13)&Char(10)& "t" &Char(13)&Char(10)& "e # e is for elephant" &Char(13)&Char(10)& "s" &Char(13)&Char(10)& "t # t is for terminte" &Char(13)&Char(10)).FullMatch +"test" + +>> IsMatch( "ab", "(?x)a # b" ) +false + +>> IsMatch( "ab", "(?x)a # " &Char(13)& " b" ) +true + +>> IsMatch( "ab", "(?x)a # " &Char(10)& " b" ) +true + +>> IsMatch( "ab", "(?x)a # " &Char(13)&Char(10)& " b" ) // one is the newline, the other is just whitespace that is ignored +true + +>> IsMatch( "ab", "(?x)a # " &Char(133)& " b" ) // \x85 +false + +// Edge cases for removal during RE translations +>> Match( "1111111122221", "(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)\11", MatchOptions.NumberedSubMatches ) +{FullMatch:"111111112222",StartMatch:1,SubMatches:Table({Value:"1"},{Value:"1"},{Value:"1"},{Value:"1"},{Value:"1"},{Value:"1"},{Value:"1"},{Value:"1"},{Value:"2"},{Value:"2"},{Value:"2"})} + +>> Match( "1111111122221", "(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)\1(?#asdf)1", MatchOptions.NumberedSubMatches ) +Blank() + +>> Match( "1111111122221", "(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)\1#asdf" & Char(13) & "1", MatchOptions.NumberedSubMatches & MatchOptions.FreeSpacing ) +Blank() + +// MatchOptions.Contains doesn't strip free spacing on PCRE2, good to compare results + +>> IsMatch( "ab", "(?x)a # c", MatchOptions.Contains ) +true + +>> IsMatch( "ab", "(?x)a # " &Char(13)& " c", MatchOptions.Contains ) +false + +>> IsMatch( "ab", "(?x)a # " &Char(10)& " c", MatchOptions.Contains ) +false + +>> IsMatch( "ab", "(?x)a # " &Char(13)&Char(10)& " c", MatchOptions.Contains ) // one is the newline, the 
other is just whitespace that is ignored +false + +>> IsMatch( "ab", "(?x)a # " &Char(133)& " c", MatchOptions.Contains ) // \x85 +true + +>> IsMatch( "ab", "(?x)a # " &Char(12)& " c", MatchOptions.Contains ) // \f +true + +>> IsMatch( "ab", "(?x)a # " &Char(11)& " c", MatchOptions.Contains ) // \v +true + +>> IsMatch( "ab", "(?x)a # " &Char(9)& " c", MatchOptions.Contains ) // \t +true + +>> IsMatch( "ab", "(?x)a # " &UniChar(2028)& " c", MatchOptions.Contains ) // \u2028 +true + +>> IsMatch( "ab", "(?x)a # " &UniChar(2029)& " c", MatchOptions.Contains ) // \u2029 +true + +>> IsMatch( "ac", "(?x)a # \n b" & Char(10) & "c", MatchOptions.Contains ) // \n doesn't terminate comment +true + +>> IsMatch( "ac", "(?x)a # \r b" & Char(10) & "c", MatchOptions.Contains ) // \r doesn't terminate comment +true + +// ESCAPED # + +>> Match( "ab#cd", "ab#cd" ) +{FullMatch:"ab#cd",StartMatch:1} + +>> Match( "ab#cd", "ab\#cd" ) +{FullMatch:"ab#cd",StartMatch:1} + +>> Match( "ab#cd", "ab[#]cd" ) +{FullMatch:"ab#cd",StartMatch:1} + +>> Match( "ab#cd", "a" & Char(10) & " b # c " & Char(10) & "d", MatchOptions.FreeSpacing ) // c is in a # comment +Blank() + +>> Match( "abd", "a" & Char(10) & " b # c " & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"abd",StartMatch:1} + +>> Match( "ab#cd", "a" & Char(10) & " b \# c" & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab#cd",StartMatch:1} + +>> Match( "ab#cd", "a" & Char(10) & " b [#] c" & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab#cd",StartMatch:1} + +>> Match( "ab#cd", "a" & Char(10) & " b [\#] c" & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab#cd",StartMatch:1} + +// ESCAPED SPACE + +>> Match( "ab cd", "ab cd" ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "ab\ cd" ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "ab[ ]cd" ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "ab[\ ]cd" ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "a" & Char(10) & " b c 
" & Char(10) & "d", MatchOptions.FreeSpacing ) // no actual space between b and c +Blank() + +>> Match( "ab cd", "a" & Char(10) & " b \ c " & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "a" & Char(10) & " b [ ] c" & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab cd",StartMatch:1} + +>> Match( "ab cd", "a" & Char(10) & " b [\ ] c" & Char(10) & "d", MatchOptions.FreeSpacing ) +{FullMatch:"ab cd",StartMatch:1} + +// FREE SPACING IGNORED SPACE CHARACTERS + +>> IsMatch( "ab", "a b", MatchOptions.FreeSpacing) // space +true + +>> IsMatch( "ab", "a" &UniChar(9)& "b", MatchOptions.FreeSpacing) // \t +true + +>> IsMatch( "ab", "a" &UniChar(10)& "b", MatchOptions.FreeSpacing) // \n +true + +>> IsMatch( "ab", "a" &UniChar(13)& "b", MatchOptions.FreeSpacing) // \r +true + +>> IsMatch( "ab", "a" &UniChar(12)& "b", MatchOptions.FreeSpacing) // \f +true + +>> IsMatch( "ab", "a" &UniChar(11)& "b", MatchOptions.FreeSpacing) // \v +false + +>> IsMatch( "ab", "a" &UniChar(160)& "b", MatchOptions.FreeSpacing) // \u00a0 +false + +>> IsMatch( "ab", "a" &UniChar(5760)& "b", MatchOptions.FreeSpacing) // \u1680 +false + +>> IsMatch( "ab", "a" &UniChar(8192)& "b", MatchOptions.FreeSpacing) // \u2000 +false + +>> IsMatch( "ab", "a" &UniChar(8193)& "b", MatchOptions.FreeSpacing) // \u2001 +false + +>> IsMatch( "ab", "a" &UniChar(8194)& "b", MatchOptions.FreeSpacing) // \u2002 +false + +>> IsMatch( "ab", "a" &UniChar(8195)& "b", MatchOptions.FreeSpacing) // \u2003 +false + +>> IsMatch( "ab", "a" &UniChar(8196)& "b", MatchOptions.FreeSpacing) // \u2004 +false + +>> IsMatch( "ab", "a" &UniChar(8197)& "b", MatchOptions.FreeSpacing) // \u2005 +false + +>> IsMatch( "ab", "a" &UniChar(8198)& "b", MatchOptions.FreeSpacing) // \u2006 +false + +>> IsMatch( "ab", "a" &UniChar(8199)& "b", MatchOptions.FreeSpacing) // \u2007 +false + +>> IsMatch( "ab", "a" &UniChar(8200)& "b", MatchOptions.FreeSpacing) // \u2008 +false + +>> IsMatch( "ab", 
"a" &UniChar(8201)& "b", MatchOptions.FreeSpacing) // \u2009 +false + +>> IsMatch( "ab", "a" &UniChar(8202)& "b", MatchOptions.FreeSpacing) // \u200a +false + +>> IsMatch( "ab", "a" &UniChar(8232)& "b", MatchOptions.FreeSpacing) // \u2028 +false + +>> IsMatch( "ab", "a" &UniChar(8233)& "b", MatchOptions.FreeSpacing) // \u2029 +false + +>> IsMatch( "ab", "a" &UniChar(8239)& "b", MatchOptions.FreeSpacing) // \u202f +false + +>> IsMatch( "ab", "a" &UniChar(8287)& "b", MatchOptions.FreeSpacing) // \u205f +false + +>> IsMatch( "ab", "a" &UniChar(12288)& "b", MatchOptions.FreeSpacing) // \u3000 +false + +>> IsMatch( "ab", "a" &UniChar(65279)& "b", MatchOptions.FreeSpacing) // \ufeff +false + +// spaces are not merely removed, they are no-ops and regular expression tokens still end at them + +>> IsMatch( UniChar(123) & "4", "(?x)\u123 4" ) // too few characters for \u +Errors: Error 29-42: Invalid regular expression: Invalid escape code, found "\u".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> Match( "aBa1", "(?x)(a)(((((((((((((((B)))))))))))))))\1 1", MatchOptions.NumberedSubMatches ).FullMatch +"aBa1" + +>> Match( "aBa1", "(?x)(a)(((((((((((((((B)))))))))))))))\11", MatchOptions.NumberedSubMatches ).FullMatch +Blank() + +>> Match( "aBa1", "(?x)(a)(((((((((((((((B)))))))))))))))\1(?#comment)1", MatchOptions.NumberedSubMatches ).FullMatch +"aBa1" + +>> Match( "aBa1", "(a)(((((((((((((((B)))))))))))))))\1(?#comment)1", MatchOptions.NumberedSubMatches ).FullMatch +"aBa1" + +>> Match( "aaaaaa", "(?x)a {3}" ) +{FullMatch:"aaa",StartMatch:1} + +>> Match( "aaaaaaaaaaaaaaaaaaaaaaaaaa", "(?x)a { 3 }" ) +Errors: Error 37-56: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "aaaaaaaaaaaaaaaaaaaaaaaaaa", "(?x)a { 1 2 }" ) +Errors: Error 37-58: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaaa", "(?x)a +" ) +{FullMatch:"aaaaaa",StartMatch:1} + diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Limited.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Limited.txt new file mode 100644 index 0000000000..f4caf16cfc --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Limited.txt @@ -0,0 +1,1415 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules,SupportColumnNamesAsIdentifiers + +// Power Fx regular expressions are limited to features that can be transpiled to native .NET (C# Interpreter), ECMAScript (Canvas), or PCRE2 (Excel). +// We want the same results everywhere for Power Fx, even if the underlying implementation is different. Even with these limits in place there are some minor semantic differences but we get as close as we can. +// These tests can be run through all three engines and the results compared by setting ExpressionEvaluationTests.RegExCompareEnabled; a PCRE2 DLL and NodeJS must be installed on the system. +// +// In short, we use the intersection of canonical .NET regular expressions and ECMAScript 2024's "v" flag for escaping rules. +// Someday when "v" is more widely available, we can support more of its features such as set subtraction. +// We chose to use canonical .NET instead of RegexOptions.ECMAScript because we wanted the Unicode definitions for words. 
See https://learn.microsoft.com/dotnet/standard/base-types/regular-expression-options#ecmascript-matching-behavior +// +// In addition, Power Fx regular expressions are opinionated and try to eliminate some of the ambiguity in the common regular expression language: +// Numbered capture groups are disabled by default, and cannot be mixed with named capture groups. +// Octal character codes are not supported, use \x or \u instead. +// Literal ^, -, [, ], {, and } must be escaped when used in a character class. +// Escaping is only supported for special characters and unknown alphanumeric escape sequences are not supported. +// Unicode characters are used throughout. +// Newlines support Windows friendly \r\n as well as \r and \n. +// +// Features that are supported: +// Literal characters. Any character except the special characters [ ] \ ^ $ . | ? * + ( ) can be inserted directly. +// Escaped special characters. \ (backslash) followed by a special character to insert it directly, includes \- when in a character class. +// Operators +// Dot (.), matches everything except [\r\n] unless MatchOptions.DotAll is used. +// Anchors, ^ and $, matches the beginning and end of the string, or of a line if MatchOptions.Multiline is used. +// Quantifiers +// Greedy quantifiers. ? matches 0 or 1 times, + matches 1 or more times, * matches 0 or more times, {3} matches exactly 3 times, {1,} matches at least 1 time, {1,3} matches between 1 and 3 times. By default, matching is "greedy" and the match will be as large as possible. +// Lazy quantifiers. Same as the greedy quantifiers followed by ?, for example *? or {1,3}?. With the lazy modifier, the match will be as small as possible. +// Alternation. a|b matches "a" or "b". +// Character classes +// Custom character class. [abc] list of characters, [a-fA-F0-9] range of characters, [^a-z] everything but these characters. 
Character classes cannot be nested, subtracted, or intersected, and the same special character cannot be repeated in the character class. +// Word characters and breaks. \w, \W, \b, \B, using the Unicode definition of letters [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}\p{Pc}\p{Lm}]. +// Digit characters. \d includes the digits 0-9 and \p{Nd}, \D matches everything except characters matched by \d. +// Space characters. \s includes spacing characters [ \r\n\t\f\x0B\x85\p{Z}], \S which matches everything except characters matched by \s, \r carriage return, \n newline, \t tab, \f form feed. +// Control characters. \cA, where the control character is [A-Za-z]. +// Hexadecimal and Unicode character codes. \x20 with two hexadecimal digits, \u2028 with four hexadecimal digits. +// Unicode character class and property. \p{Ll} matches all Unicode lowercase letters, while \P{Ll} matches everything that is not a Unicode lowercase letter. +// Capture groups +// Non capture group. (?:a), group without capturing the result as a named or numbered sub-match. +// Named group and back reference. (?<name>chars) captures a sub-match with the name name, referenced with \k<name>. Cannot be used if MatchOptions.NumberedSubMatches is enabled. +// Numbered group and back references. (a|b) captures a sub-match, referenced with \1. MatchOptions.NumberedSubMatches must be enabled. +// Lookahead and lookbehind. (?=a), (?!a), (?<=b), (?> Match( "asdf", "a(sdf)" ) +{FullMatch:"asdf",StartMatch:1} + +// ErrInvalidRegExUnclosedCaptureGroups +>> Match( "asdf", "a(sdf" ) +Errors: Error 15-22: Invalid regular expression: Unclosed groups, too few closing parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExUnopenedCaptureGroups +>> Match( "asdf", "asdf)" ) +Errors: Error 15-22: Invalid regular expression: Unopened groups, too few opening parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. 
+ +// Self referencing groups are disallowed + +// ErrInvalidRegExNumberedSubMatchesDisabled +>> Match( "aa", "(a\1)" ) +Errors: Error 13-20: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExBadBackRefSelfReferencing +>> Match( "aa", "(a\1)", MatchOptions.NumberedSubMatches ) +Errors: Error 13-20: Invalid regular expression: Self-referencing capture groups are not supported, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aa", "(?a\k)" ) +Errors: Error 13-35: Invalid regular expression: Self-referencing capture groups are not supported, found "\k".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aa aaaa aaaaaa ", "((a+)(\1) ?)+" ) +Errors: Error 26-41: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aa aaaa aaaaaa ", "((a+)(\1) ?)+", MatchOptions.NumberedSubMatches ) +Errors: Error 26-41: Invalid regular expression: Self-referencing capture groups are not supported, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aa aaaa aaaaaa ", "(?(a+)(\k) ?)+" ) +Errors: Error 26-56: Invalid regular expression: Self-referencing capture groups are not supported, found "\k".|Error 0-5: The function 'Match' has some invalid arguments. + +// Backreferences without a group are disallowed + +>> Match( "hello howdy", "([hi]).*\1" ) +Errors: Error 22-34: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +// ErrInvalidRegExBadBackRefNotDefined +>> Match( "hello howdy", "([hi]).*\k<1>" ) +Errors: Error 22-37: Invalid regular expression: Capture group "\k<1>" not defined.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello howdy", "(?[hi]).*\k" ) +{FullMatch:"hello h",StartMatch:1,first:"h"} + +// ErrInvalidRegExMixingNamedAndNumberedSubMatches +>> Match( "hello howdy", "(?[hi]).*\k", MatchOptions.NumberedSubMatches ) +Errors: Error 22-49: Invalid regular expression: Named captures cannot be used with MatchOptions.NumberedSubMatches enabled, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello howdy", "(?[hi]).*\1" ) +Errors: Error 22-42: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello howdy", "([hi]).*\1", MatchOptions.NumberedSubMatches ) +{FullMatch:"hello h",StartMatch:1,SubMatches:Table({Value:"h"})} + +>> Match( "hello howdy", "(?[hi]).*\k" ) +Errors: Error 22-50: Invalid regular expression: Capture group "\k" not defined.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(((((((((((?l))))))))))).*\k") // 11 parens +{FullMatch:"llo worl",StartMatch:3,eleven:"l"} + +>> Match( "hello world", "(((((((((((l))))))))))).*\11") // 11 parens +Errors: Error 22-52: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\11".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello world", "(((((((((((l))))))))))).*\11", MatchOptions.NumberedSubMatches) // 11 parens +{FullMatch:"llo worl",StartMatch:3,SubMatches:Table({Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"})} + +>> Match( "hello world", "(((((((((((?l)))))))))).*\k") // unclosed 11th paren +Errors: Error 22-67: Invalid regular expression: Unclosed groups, too few closing parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(?((((((((((?l)))))))))).*\k") // unclosed 11th paren +Errors: Error 22-74: Invalid regular expression: Self-referencing capture groups are not supported, found "\k".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(?(?(?(?(?(?(?(?(?(?(l))))))))))).*\k") // 11 parens +{FullMatch:"llo worl",StartMatch:3,a:"l",b:"l",c:"l",d:"l",e:"l",f:"l",g:"l",h:"l",i:"l",j:"l"} + +>> Match( "hello world", "(?(?(?(?(?(?(?(?(?(?(l))))))))))).*\k") // 11 parens +{FullMatch:"llo worl",StartMatch:3,a:"l",b:"l",c:"l",d:"l",e:"l",f:"l",g:"l",h:"l",i:"l",j:"l"} + +>> Match( "hello world", "(((((((((((l))))))))))).*\1") // 11 parens +Errors: Error 22-51: Invalid regular expression: Use named captures with "(?...)" and "\k" or enable MatchOptions.NumberedSubMatches, found "\1".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello world", "(((((((((((l))))))))))).*\1", MatchOptions.NumberedSubMatches) // 11 parens +{FullMatch:"llo worl",StartMatch:3,SubMatches:Table({Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"})} + +>> Match( "hello world", "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "(?l)" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & ".*\k" ) +{FullMatch:"llo worl",StartMatch:3,hundredone:"l"} + +>> Match( "hello world", "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "(l)" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & ".*\101", MatchOptions.NumberedSubMatches ) +{FullMatch:"llo 
worl",StartMatch:3,SubMatches:Table({Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"},{Value:"l"})} + +>> Match( "hello world", "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "((((((((((" & "(?l" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & "))))))))))" & ".*\k" ) // missing paren +Errors: Error 341-342: Invalid regular expression: Unclosed groups, too few closing parenthesis.|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello world", "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(?l)" & ".*\k" ) +{FullMatch:"llo worl",StartMatch:3,hundredone:"l"} + +>> Match( "hello world", "(((())(())(())(((((((())))))))))()(?l)()\k") +{FullMatch:"ll",StartMatch:3,letter:"l"} + +>> Match( "hello world", "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(?l)" & ".*\k", MatchOptions.NumberedSubMatches ) +Errors: Error 491-492: Invalid regular expression: Named captures cannot be used with MatchOptions.NumberedSubMatches enabled, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(((())(())(())(((((((())))))))))()(?l)()\k", MatchOptions.NumberedSubMatches) +Errors: Error 22-82: Invalid regular expression: Named captures cannot be used with MatchOptions.NumberedSubMatches enabled, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello world", "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)(z*)" & "(l)" & ".*\101", MatchOptions.NumberedSubMatches ) +{FullMatch:"llo worl",StartMatch:3,SubMatches:Table({Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:"l"})} + +>> Match( "hello world", "(((())(())(())(((((((())))))))))()(l)()\18", MatchOptions.NumberedSubMatches) 
+{FullMatch:"ll",StartMatch:3,SubMatches:Table({Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:""},{Value:"l"},{Value:""})} + +// balancing groups + +// ErrInvalidRegExBadBalancing +>> Match( "(hello world)", "(?)a") +Errors: Error 24-35: Invalid regular expression: Balancing capture groups are not supported, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "(hello world)", "(?)a(?<-s>)b") +Errors: Error 24-41: Invalid regular expression: Balancing capture groups are not supported, found "(?<-s>".|Error 0-5: The function 'Match' has some invalid arguments. + +// groups with single ticks + +// ErrInvalidRegExBadSingleQuoteNamedCapture +>> Match( "(hello world)", "(?'name'l)") +Errors: Error 24-36: Invalid regular expression: Using single quoted named captures is not supported, use (?<...>) syntax instead, found "(?'name'".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "(hello world)", "(?'s-e'l)") +Errors: Error 24-35: Invalid regular expression: Using single quoted named captures is not supported, use (?<...>) syntax instead, found "(?'s-e'".|Error 0-5: The function 'Match' has some invalid arguments. + +// Octal characters are not allowed + +// ErrInvalidRegExBadOctal +>> Match( "as$df", "\044" ) +Errors: Error 16-22: Invalid regular expression: Octal \0 character codes are not supported, use hexadecimal \x or Unicode \u instead, found "\044".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "as$df", "\0" ) +Errors: Error 16-20: Invalid regular expression: Octal \0 character codes are not supported, use hexadecimal \x or Unicode \u instead, found "\0".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "as$df", "\000" ) +Errors: Error 16-22: Invalid regular expression: Octal \0 character codes are not supported, use hexadecimal \x or Unicode \u instead, found "\000".|Error 0-5: The function 'Match' has some invalid arguments. + +// Can't define named capture group more than once + +>> Match( "test", "(?t).*(?t)" ) +{FullMatch:"test",StartMatch:1,one:"t",two:"t"} + +>> Match( "test", "((?t)|(?t))" ) +{FullMatch:"t",StartMatch:1,one:"t",two:Blank()} + +// ErrInvalidRegExBadNamedCaptureAlreadyExists +>> Match( "test", "(?t).*(?t)" ) +Errors: Error 15-37: Invalid regular expression: Named capture group "(?" defined more than once.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "((?t)|(?t))" ) +Errors: Error 15-38: Invalid regular expression: Named capture group "(?" defined more than once.|Error 0-5: The function 'Match' has some invalid arguments. + +// Bad named capture group names + +>> Match( "test", "(?s).*" ) +{FullMatch:"st",StartMatch:3,a:"s"} + +>> Match( "test", "(?s).*" ) +{FullMatch:"st",StartMatch:3,a1:"s"} + +// ErrInvalidRegExBadNamedCaptureName +>> Match( "test", "(?<1>s).*" ) +Errors: Error 15-26: Invalid regular expression: Named capture group name must be a combination of letters and digits and begin with a letter, found "(?<1>".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "(?<1a>s).*" ) +Errors: Error 15-27: Invalid regular expression: Named capture group name must be a combination of letters and digits and begin with a letter, found "(?<1a>".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "(?s).*" ) +Errors: Error 15-27: Invalid regular expression: Named capture group name must be a combination of letters and digits and begin with a letter, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "test", "(?s).*" ) +Errors: Error 15-27: Invalid regular expression: Named capture group name must be a combination of letters and digits and begin with a letter, found "(?".|Error 0-5: The function 'Match' has some invalid arguments. + +// Group name case sensitivity + +>> Match( "test", "(?t).*\k") +{FullMatch:"test",StartMatch:1,a:"t"} + +>> Match( "test", "(?t).*\k") +Errors: Error 15-31: Invalid regular expression: Capture group "\k" not defined.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "(?t).*\k") +Errors: Error 15-31: Invalid regular expression: Capture group "\k" not defined.|Error 0-5: The function 'Match' has some invalid arguments. + +// conditional alternation + +// ErrInvalidRegExBadConditional +// Console.WriteLine( Regex.Match( "1-23-456-7890", @"(?(\d{2}-)\d{2}-\d{2}|\d{3}-\d{2})" ).Value ); +>> Match( "1-23-456-7890", "(?(\d{2}-)\d{2}-\d{2}|\d{3}-\d{2})" ) +Errors: Error 24-60: Invalid regular expression: Conditional alternation is not supported, found "(?(".|Error 0-5: The function 'Match' has some invalid arguments. + +// Console.WriteLine( Regex.Match( "hello world", @"(e)(?(1)l|d)" ).Value ); +>> Match( "hello world", "(e)(?(1)l|d)" ) +Errors: Error 22-36: Invalid regular expression: Conditional alternation is not supported, found "(?(".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +// support for \r in compile time checks + +>> ShowColumns( Match( "ab", "(?x)a # " & Char(13) & " (?b)" ), capture ) +{capture:"b"} + +>> ShowColumns( Match( "ab", "(?x)a # " & Char(10) & " (?b)" ), capture ) +{capture:"b"} + +>> ShowColumns( Match( "ab", "a # " & Char(13) & " (?b)", MatchOptions.FreeSpacing ), capture ) +{capture:"b"} + +>> ShowColumns( Match( "ab", "a # " & Char(10) & " (?b)", MatchOptions.FreeSpacing ), capture ) +{capture:"b"} + +// quantified capture groups + +// Node != Net for this test ("b" is empty for Node), a known difference between PCRE2 and Perl too +// >> Match( "aba", "^(a(b)?)+$", MatchOptions.NumberedSubMatches ) +// {FullMatch:"aba",StartMatch:1,SubMatches:Table({Value:"a"},{Value:"b"})} + +>> Match( "aba", "^(a(?:b)?)+$", MatchOptions.NumberedSubMatches ) +{FullMatch:"aba",StartMatch:1,SubMatches:Table({Value:"a"})} + +>> Match( "aba", "^(a(b)?)+$" ) +{FullMatch:"aba",StartMatch:1} + +// Node != Net for this test +// >> Match( "b", "(a)?b\1", MatchOptions.NumberedSubMatches ) +// Blank() + +// Node != Net for this test +// >> Match( "b", "(?a)?b\k", MatchOptions.NumberedSubMatches ) +// Blank() + +// difference between null and empty string + +>> Match( "ab", "a(d)?b", MatchOptions.NumberedSubMatches ) +{FullMatch:"ab",StartMatch:1,SubMatches:Table({Value:Blank()})} + +>> Match( "ab", "a(d?)b", MatchOptions.NumberedSubMatches ) +{FullMatch:"ab",StartMatch:1,SubMatches:Table({Value:""})} + +>> Match( "ab", "a(?d)?b" ) +{FullMatch:"ab",StartMatch:1,one:Blank()} + +>> Match( "ab", "a(?d?)b" ) +{FullMatch:"ab",StartMatch:1,one:""} + +// +// CHARACTER CLASSES +// + +// unclosed + +>> Match( "asdf", "asd[asdf]" ) +{FullMatch:"asdf",StartMatch:1} + +>> Match( "asdf", "asd[asdf" ) +Errors: Error 15-25: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "asd[".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "asdf", "asd]asdf" ) +Errors: Error 15-25: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "asd]".|Error 0-5: The function 'Match' has some invalid arguments. + +// character class and literal square brackets + +// ErrInvalidRegExEmptyCharacterClass +>> Match( "a", "[]" ) +Errors: Error 12-16: Invalid regular expression: Square bracket character classes cannot be empty, found "[]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "]", "[]]" ) +Errors: Error 12-17: Invalid regular expression: Square bracket character classes cannot be empty, found "[]".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExBadSquare +>> Match( "[", "[[]" ) +Errors: Error 12-17: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcdef]ghijk", "[\w]\w]" ) +Errors: Error 23-32: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[\w]\w]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcdef]ghijk", "asdfasdfasdfsadf[\w]\w]" ) +Errors: Error 23-48: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "...f[\w]\w]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "a]", "[a]]" ) +Errors: Error 13-19: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[a]]".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "abcdef]ghijk", "[\w\]\w]" ) // escaped closing square bracket +{FullMatch:"a",StartMatch:1} + +>> Match( "[", "[\[]" ) +{FullMatch:"[",StartMatch:1} + +>> Match( "]", "[\]]" ) +{FullMatch:"]",StartMatch:1} + +>> Match( ">test[", "[\w\[>]+" ) +{FullMatch:">test[",StartMatch:1} + +>> Match( ">test[", "[\w\]>]+" ) +{FullMatch:">test",StartMatch:1} + +// ErrInvalidRegExLiteralHyphenInCharacterClass +>> Match( "aaaaa", "[-a]" ) +Errors: Error 16-22: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[-a]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaa", "[]a]" ) +Errors: Error 16-22: Invalid regular expression: Square bracket character classes cannot be empty, found "[]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaa", "[[a]" ) +Errors: Error 16-22: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExUnescapedCharInCharacterClass +>> Match( "aaaaa", "[a^]" ) +Errors: Error 16-22: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "^".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaa", "[]" ) +Errors: Error 16-20: Invalid regular expression: Square bracket character classes cannot be empty, found "[]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaa", "[a(b)]" ) +Errors: Error 16-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "(".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "aaaaa", "[a(b]" ) +Errors: Error 16-23: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "(".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "aaaaa", "[a)b]" ) +Errors: Error 16-23: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of ")".|Error 0-5: The function 'Match' has some invalid arguments. + +// character class subtraction + +// ErrInvalidRegExBadSquare +>> Match( "k", "[a-z-[b-c]]" ) +Errors: Error 12-25: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +// repeated characters in character class, used by intersection and future character class features, would also catch POSIX cases if they weren't already blocked by nested square brackets + +>> Match( "hello", "[a-z&&[k-m]]" ) +Errors: Error 16-30: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello", "[a-z&&k-m]" ) +Errors: Error 16-28: Invalid regular expression: Character appears more than once in character class, found repeated "&&".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello", "[a-hh-z]" ) +{FullMatch:"h",StartMatch:1} + +// ErrInvalidRegExBadSquare +>> Match( "HellO", "[[:lower:]]" ) +Errors: Error 16-29: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello", "[[:s:]]" ) +Errors: Error 16-25: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello", "[[=x=]]" ) +Errors: Error 16-25: Invalid regular expression: Literal square braces must be escaped with a backslash even in character classes, for example \[ or \], found at the end of "[".|Error 0-5: The function 'Match' has some invalid arguments. + +// literal curly braces + +// ErrInvalidRegExBadCurly +>> Match( "asdf", "{}" ) +Errors: Error 15-19: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "asdf{}" ) +Errors: Error 15-23: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "as{}" ) +Errors: Error 15-21: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "as{" ) +Errors: Error 15-20: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf{}", "}" ) +Errors: Error 17-20: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "[{]" ) +Errors: Error 15-20: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "{".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "asdf", "[}]" ) +Errors: Error 15-20: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "[{}]" ) +Errors: Error 15-21: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf{}", "\{\}" ) +{FullMatch:"{}",StartMatch:5} + +>> Match( "asdf{}", "\{" ) +{FullMatch:"{",StartMatch:5} + +>> Match( "asdf{}", "\}" ) +{FullMatch:"}",StartMatch:6} + +>> Match( "asdf{}", "[\{\}]" ) +{FullMatch:"{",StartMatch:5} + +>> Match( "asdf{}", "[\{\}]+" ) +{FullMatch:"{}",StartMatch:5} + +>> Match( "asdf{}", "[\{]" ) +{FullMatch:"{",StartMatch:5} + +>> Match( "asdf{}", "[\}]" ) +{FullMatch:"}",StartMatch:6} + +// +// ESCAPES +// + +// ErrInvalidRegExBadEscape +>> Match( "test", "\a" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\a".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\A" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\A".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "$test atest test", "\btest" ) +{FullMatch:"test",StartMatch:2} + +>> Match( "$test atest test", "\Btest" ) +{FullMatch:"test",StartMatch:8} + +>> DropColumns( Match( "test" & Char(10) & "bed", "\cj" ), FullMatch ) +{StartMatch:5} + +>> DropColumns( Match( "test" & Char(10) & "bed", "\cJ" ), FullMatch ) +{StartMatch:5} + +>> DropColumns( Match( "test" & Char(13) & "bed", "\cm" ), FullMatch ) +{StartMatch:5} + +>> DropColumns( Match( "test" & Char(13) & "bed", "\cM" ), FullMatch ) +{StartMatch:5} + +>> Match( "test", "\C" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\C".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test123bed", "\d+" ) +{FullMatch:"123",StartMatch:5} + +>> Match( "test123bed", "\D+" ) +{FullMatch:"test",StartMatch:1} + +>> Match( "test", "\e" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\e".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\E" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\E".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test"&Char(12)&"bed", "\f" ) +{FullMatch:" ",StartMatch:5} + +>> Match( "test", "\F" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\F".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\g" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\g".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\G" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\G".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\h" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\h".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "test", "\H" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\H".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\i" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\i".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\I" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\I".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\j" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\j".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\J" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\J".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "1234551234", "(?\d)\k" ) +{FullMatch:"55",StartMatch:5,first:"5"} + +>> Match( "test", "\K" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\K".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\l" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\l".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\L" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\L".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\m" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\m".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\M" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\M".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> DropColumns( Match( "test" & Char(10) & "bed", "\n" ), FullMatch ) +{StartMatch:5} + +>> Match( "test", "\N" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\N".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\o" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\o".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\O" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\O".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "123test456", "\p{L}+" ) +{FullMatch:"test",StartMatch:4} + +>> Match( "foo123test456", "\P{L}+" ) +{FullMatch:"123",StartMatch:4} + +>> Match( "test", "\q" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\q".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\Q" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\Q".|Error 0-5: The function 'Match' has some invalid arguments. + +>> DropColumns( Match( "test" & Char(13) & "bed", "\r" ), FullMatch ) +{StartMatch:5} + +>> Match( "test", "\R" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\R".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test bed", "\s+" ) +{FullMatch:" ",StartMatch:5} + +>> Match( " test ", "\S+" ) +{FullMatch:"test",StartMatch:4} + +>> Match( "test" & Char(9) & "bed", "\t" ) +{FullMatch:" ",StartMatch:5} + +>> Match( "test", "\T" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\T".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\u0065" ) +{FullMatch:"e",StartMatch:2} + +>> Match( "test", "\U" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\U".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "test" & Char(11) & "bed", "\v" ) +Errors: Error 34-38: Invalid regular expression: Invalid escape code, found "\v".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\V" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\V".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "&*%test bed^%$", "\w+" ) +{FullMatch:"test",StartMatch:4} + +>> Match( "test%bed", "\W" ) +{FullMatch:"%",StartMatch:5} + +>> Match( "test", "\x65" ) +{FullMatch:"e",StartMatch:2} + +>> Match( "test", "\X" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\X".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\y" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\y".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\Y" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\Y".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\z" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\z".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\Z" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\Z".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\_" ) +Errors: Error 15-19: Invalid regular expression: Invalid escape code, found "\_".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +// negated escape class and \b can't appear in character classes, can't easily transpile without character class subtraction or intersection in ECMAScript + +// ErrInvalidRegExBadEscapeInsideCharacterClass +>> Match( "$test" & Char(8) & "test", "[\b]test" ) +Errors: Error 35-45: Invalid regular expression: Escape character not permitted within character class, found "\b".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "$test" & Char(8) & "test", "[\B]test" ) +Errors: Error 35-45: Invalid regular expression: Escape character not permitted within character class, found "\B".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test123bed", "[\D]+" ) +{FullMatch:"test",StartMatch:1} + +// ErrInvalidRegExBadEscapeInsideNegativeCharacterClass +>> Match( "test123bed", "[^\D]+" ) +Errors: Error 21-29: Invalid regular expression: Negative escape character not permitted within negated character class, found "\D".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "$test" & Char(8) & "test", "[\W]test" ) +{FullMatch:"$test",StartMatch:1} + +>> Match( "$test" & Char(8) & "test", "[^\W]test" ) +Errors: Error 35-46: Invalid regular expression: Negative escape character not permitted within negated character class, found "\W".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "$test" & Char(8) & "test", "[\S]test" ) +{FullMatch:"$test",StartMatch:1} + +>> Match( "$test" & Char(8) & "test", "[^\S]test" ) +Errors: Error 35-46: Invalid regular expression: Negative escape character not permitted within negated character class, found "\S".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "foo123test456", "[\P{L}]+" ) +{FullMatch:"123",StartMatch:4} + +>> Match( "foo123test456", "[^\P{L}]+") // would be problematic if we wanted to implement MatchOptions.LocaleAware in the future +Errors: Error 24-35: Invalid regular expression: Negative escape character not permitted within negated character class, found "\P{L}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "$test" & Char(8) & "test", "[\w]test" ) // \w is OK +Blank() + +>> Match( "$test" & Char(8) & "atest", "[\w]test" ) // \w is OK +{FullMatch:"atest",StartMatch:7} + +// Limits on character classes + +>> Match( "test", "\c@" ) +Errors: Error 15-20: Invalid regular expression: Invalid escape code, found "\c".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\xF" ) +Errors: Error 15-20: Invalid regular expression: Invalid escape code, found "\x".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "eF", "\x65F" ) // will only use the first two characters for the hex code and leave the F to match separately +{FullMatch:"eF",StartMatch:1} + +>> Match( "test", "\uF" ) +Errors: Error 15-20: Invalid regular expression: Invalid escape code, found "\u".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\uFF" ) +Errors: Error 15-21: Invalid regular expression: Invalid escape code, found "\u".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "test", "\uFFF" ) +Errors: Error 15-22: Invalid regular expression: Invalid escape code, found "\u".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "eF", "\u0065F" ) // will only use the first four characters for the unicode and leave the F to match separately +{FullMatch:"eF",StartMatch:1} + +>> Match( "test", "\p{@}" ) +Errors: Error 15-22: Invalid regular expression: Invalid escape code, found "\p".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "test", "\P{@}" ) +Errors: Error 15-22: Invalid regular expression: Invalid escape code, found "\P".|Error 0-5: The function 'Match' has some invalid arguments. + +// Escape characters acceptable to ECMAScript, plus \# and \ for x mode + +>> Match("^$\.*+?()[]{}|/# ", "\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\#\ " ) +{FullMatch:"^$\.*+?()[]{}|/# ",StartMatch:1} + +>> Match("^$\.*+?()[]{}|/# ", "[\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\#\ ]+" ) +{FullMatch:"^$\.*+?()[]{}|/# ",StartMatch:1} + +// Escape characters that are blocked outside a character class + +// ErrInvalidRegExBadEscapeOutsideCharacterClass +>> Match( "!@#%&=-`~><';:,""", "\!" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\!".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\@" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\@".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\%" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\%".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\&" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\&".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\=" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\=".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\-" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\-".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "!@#%&=-`~><';:,""", "\`" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\`".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\~" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\~".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\>" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\>".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\<" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\<".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\;" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\;".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\:" ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\:".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "!@#%&=-`~><';:,""", "\," ) +Errors: Error 28-32: Invalid regular expression: Escape character not permitted outside a character class, found "\,".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "a#b c", "#" ) +{FullMatch:"#",StartMatch:2} + +>> Match( "a#b c", " " ) +{FullMatch:" ",StartMatch:4} + +>> Match( "a#b c", "[#]" ) +{FullMatch:"#",StartMatch:2} + +>> Match( "a#b c", "[ ]" ) +{FullMatch:" ",StartMatch:4} + +>> Match( "a#b c", "\#" ) +{FullMatch:"#",StartMatch:2} + +>> Match( "a#b c", "\ " ) +{FullMatch:" ",StartMatch:4} + +>> Match( "a#b c", "[\ ]" ) +{FullMatch:" ",StartMatch:4} + +>> Match( "a#b c", "[\#]" ) +{FullMatch:"#",StartMatch:2} + +// +// OPTIONS +// + +// inline options + +>> Match( "hello"&Char(10)&"howdy", "o$" ) +Blank() + +>> Match( "hello"&Char(10)&"howdy", "o$", MatchOptions.Multiline ) +{FullMatch:"o",StartMatch:5} + +>> Match( "hello"&Char(10)&"howdy", "(?im)o$" ) +{FullMatch:"o",StartMatch:5} + +>> Match( "hello"&Char(10)&"howdy", "(?m)o$" ) +{FullMatch:"o",StartMatch:5} + +>> With( Match( "hello"&Char(10)&"howdy", "(?s)hello.howdy" ), {StartMatch: StartMatch, LengthMatch: Len(FullMatch)} ) +{LengthMatch:11,StartMatch:1} + +>> With( Match( "hello"&Char(13)&"howdy", "(?s)hello.howdy" ), {StartMatch: StartMatch, LengthMatch: Len(FullMatch)} ) +{LengthMatch:11,StartMatch:1} + +>> Match( "hello"&Char(13)&Char(10)&"howdy", "(?s)hello.howdy" ) +Blank() + +>> With( Match( "hello"&Char(10)&"howdy", "hello.howdy", MatchOptions.DotAll ), {StartMatch: StartMatch, LengthMatch: Len(FullMatch)} ) +{LengthMatch:11,StartMatch:1} + +>> With( Match( "hello"&Char(13)&"howdy", "hello.howdy", MatchOptions.DotAll ), {StartMatch: StartMatch, LengthMatch: Len(FullMatch)} ) +{LengthMatch:11,StartMatch:1} + +>> Match( "hello"&Char(13)&Char(10)&"howdy", "hello.howdy", MatchOptions.DotAll ) +Blank() + +>> Match( "hello, howdy", "(?x) llo , \s how # comment" ) +{FullMatch:"llo, how",StartMatch:3} + +// unsupported inline options + +// ErrInvalidRegExBadInlineOptions +>> Match( "hello"&Char(10)&"howdy", "(?-m)o$" ) +Errors: Error 33-42: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot 
disable options, and cannot be used on a subexpression, found "(?-m)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello"&Char(10)&"howdy", "(?i-m)o$" ) +Errors: Error 33-43: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?i-m)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello"&Char(10)&"howdy", "(?g)o$" ) +Errors: Error 33-41: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?g)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello"&Char(10)&"howdy", "(?u)o$" ) +Errors: Error 33-41: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?u)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello"&Char(10)&"howdy", "(?v)o$" ) +Errors: Error 33-41: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?v)".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExBadOptionsNotAtFront +>> Match( "hello"&Char(10)&"howdy", "^(?m)o$" ) +Errors: Error 33-42: Invalid regular expression: Inline options must appear at the beginning of the regular expression, found "(?m)" later.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello"&Char(10)&"howdy", "^(?i-m)o$" ) +Errors: Error 33-44: Invalid regular expression: Inline options must appear at the beginning of the regular expression, found "(?i-m)" later.|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "hello"&Char(10)&"howdy", "^(?m:o$)" ) +Errors: Error 33-43: Invalid regular expression: Inline options must appear at the beginning of the regular expression, found "(?m:" later.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(?n)o") // n accepted for compatibility, but is already turned on +{FullMatch:"o",StartMatch:5} + +// ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches +>> Match( "hello world", "(?n)o", MatchOptions.NumberedSubMatches) // but n not accepted if it contradicts MatchOptions.NumberedSubMatches +Errors: Error 22-29: Invalid regular expression: Inline option is incompatible with MatchOptions.NumberedSubMatches, found "(?n)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "hello world", "(?s)o") +{FullMatch:"o",StartMatch:5} + +>> Match( "hello world", "(?x)o") +{FullMatch:"o",StartMatch:5} + +>> Match( "hello world", "(?msnxi)o") +{FullMatch:"o",StartMatch:5} + +// ErrInvalidRegExBadParen +>> Match ("hello world", "(?^)o") // PCRE2 +Errors: Error 22-29: Invalid regular expression: Unsupported special group, found "(?^".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExRepeatedInlineOption +>> Match ("hello world", "(?xx)o") // PCRE2 +Errors: Error 22-30: Invalid regular expression: Repeated inline option, found "(?xx)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match ("hello world", "(?imsni)o") // PCRE2 +Errors: Error 22-33: Invalid regular expression: Repeated inline option, found "(?imsni)".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExBadInlineOptions +>> Match( "hello world", "(?A)o") +Errors: Error 22-29: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?A)".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match ("hello world", "(?J)o") // PCRE2 +Errors: Error 22-29: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?J)".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match ("hello world", "(?U)o") // PCRE2 +Errors: Error 22-29: Invalid regular expression: Inline options are limited to a combination of the letters [imnsx], cannot disable options, and cannot be used on a subexpression, found "(?U)".|Error 0-5: The function 'Match' has some invalid arguments. + +// Option DotAll + +>> DropColumns( Match( "te" & " " & "t", "te.t", MatchOptions.DotAll ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & Char(10) & "t", "te.t", MatchOptions.DotAll ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & Char(13) & "t", "te.t", MatchOptions.DotAll ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & UniChar(Hex2Dec("2028")) & "t", "te.t", MatchOptions.DotAll ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & UniChar(Hex2Dec("2029")) & "t", "te.t", MatchOptions.DotAll ), FullMatch ) +{StartMatch:1} + +// +// QUANTIFIERS +// + +// greedy and lazy quantifiers + +>> Match( "#abcdef#", "\w+" ).FullMatch +"abcdef" + +>> Match( "abcdef", "\w+?" ).FullMatch +"a" + +>> Match( "abcdef", "\w*" ).FullMatch +"abcdef" + +>> Match( "abcdef", "\w*?" ).FullMatch +"" + +>> Match( "abcdef", "\w?" ).FullMatch +"a" + +>> Match( "abcdef", "\w??" ).FullMatch +"" + +>> Match( "abcdef", "\w{2}" ).FullMatch +"ab" + +>> Match( "abcdef", "\w{2,}" ).FullMatch +"abcdef" + +>> Match( "abcdef", "\w{2,}?" ).FullMatch +"ab" + +>> Match( "abcdef", "\w{2,4}" ).FullMatch +"abcd" + +>> Match( "abcdef", "\w{2,4}?" 
).FullMatch +"ab" + +// unclosed + +>> Match( "abcdef", "\w{2" ) +Errors: Error 17-23: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcdef", "\w2}" ) +Errors: Error 17-23: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcdef", "\w{2," ) +Errors: Error 17-24: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrInvalidRegExBadCurly +>> Match( "abcdef", "\w{,2}" ).FullMatch +Errors: Error 17-25: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments.|Error 27-37: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{,2}?" ).FullMatch +Errors: Error 17-26: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments.|Error 28-38: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2" ).FullMatch +Errors: Error 17-23: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "{".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w2}" ).FullMatch +Errors: Error 17-23: Invalid regular expression: Literal curly braces must be escaped with a backslash, found "}".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +// ErrInvalidRegExBadExactQuantifier +>> Match( "abcdef", "\w{2}?" 
).FullMatch +Errors: Error 17-25: Invalid regular expression: Exact quantifiers cannot be used with quantifier modifiers such as ? for lazy, found "{2}?".|Error 0-5: The function 'Match' has some invalid arguments.|Error 27-37: Name isn't valid. 'FullMatch' isn't recognized. + +// possessive quantifiers + +// ErrInvalidRegExBadQuantifier +>> Match( "abcdef", "\w++" ).FullMatch +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "++".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w*+" ).FullMatch +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "*+".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w?+" ).FullMatch +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "?+".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2}+" ).FullMatch +Errors: Error 17-25: Invalid regular expression: Exact quantifiers cannot be used with quantifier modifiers such as ? for lazy, found "{2}+".|Error 0-5: The function 'Match' has some invalid arguments.|Error 27-37: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2,}+" ).FullMatch +Errors: Error 17-26: Invalid regular expression: Possessive quantifiers are not supported, found "{2,}+".|Error 0-5: The function 'Match' has some invalid arguments.|Error 28-38: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2,4}+" ).FullMatch +Errors: Error 17-27: Invalid regular expression: Possessive quantifiers are not supported, found "{2,4}+".|Error 0-5: The function 'Match' has some invalid arguments.|Error 29-39: Name isn't valid. 'FullMatch' isn't recognized. 
+ +>> Match( "abcdef", "\w+*" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "+*".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w**" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "**".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w?*" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-23: Invalid regular expression: Possessive quantifiers are not supported, found "?*".|Error 0-5: The function 'Match' has some invalid arguments.|Error 25-35: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2}*" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-25: Invalid regular expression: Exact quantifiers cannot be used with quantifier modifiers such as ? for lazy, found "{2}*".|Error 0-5: The function 'Match' has some invalid arguments.|Error 27-37: Name isn't valid. 'FullMatch' isn't recognized. + +>> Match( "abcdef", "\w{2,}*" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-26: Invalid regular expression: Possessive quantifiers are not supported, found "{2,}*".|Error 0-5: The function 'Match' has some invalid arguments.|Error 28-38: Name isn't valid. 'FullMatch' isn't recognized. 
+ +>> Match( "abcdef", "\w{2,4}*" ).FullMatch // in case * is used someday in the future as a quantifier modifier +Errors: Error 17-27: Invalid regular expression: Possessive quantifiers are not supported, found "{2,4}*".|Error 0-5: The function 'Match' has some invalid arguments.|Error 29-39: Name isn't valid. 'FullMatch' isn't recognized. + + +// +// UNICODE +// + +// Unicode letters as word characters are matched + +>> Match( "the whole world", "\b(\w+\s*)+" ) +{FullMatch:"the whole world",StartMatch:1} + +>> Match( "целый мир", "\b(\w+\s*)+" ) +{FullMatch:"целый мир",StartMatch:1} + +>> Match( "el niño", "\b(\w+\s*)+" ) +{FullMatch:"el niño",StartMatch:1} + +>> Match( "Müller", "^\w+$" ) +{FullMatch:"Müller",StartMatch:1} + +// Unicode numbers as digits are matched + +>> Match( "12345", "^\d+$" ) +{FullMatch:"12345",StartMatch:1} + +>> Match( "12٤45", "^\d+$" ) +{FullMatch:"12٤45",StartMatch:1} + +>> Match( "123४5", "^\d+$" ) +{FullMatch:"123४5",StartMatch:1} + +>> Match( "abc3d", "^\D+" ) +{FullMatch:"abc",StartMatch:1} + +>> Match( "abc٤45", "^\D+" ) +{FullMatch:"abc",StartMatch:1} + +>> Match( "abc४5", "^\D+" ) +{FullMatch:"abc",StartMatch:1} + +// See Match_Unicode for character class consistency tests + +>> Match( "1aBc2+", "\p{L}") +{FullMatch:"a",StartMatch:2} + +>> Match( "1Abc2+", "\p{L}") +{FullMatch:"A",StartMatch:2} + +>> Match( "aBc2+", "\P{L}") +{FullMatch:"2",StartMatch:4} + +>> Match( "1aBc2+", "\p{Ll}") +{FullMatch:"a",StartMatch:2} + +>> Match( "aBc2+", "\P{Ll}") +{FullMatch:"B",StartMatch:2} + +>> Match( "1aBc2+", "\p{Lu}") +{FullMatch:"B",StartMatch:3} + +>> Match( "Bc2+", "\P{Lu}") +{FullMatch:"c",StartMatch:2} + +>> Match( "1ῼa", "\p{Lt}" ) // Unicode Character “ῼ” (U+1FFC), Greek Capital Letter Omega with Prosgegrammeni +{FullMatch:"ῼ",StartMatch:2} + +>> Match( "ῼa", "\P{Lt}" ) // Unicode Character “ῼ” (U+1FFC), Greek Capital Letter Omega with Prosgegrammeni +{FullMatch:"a",StartMatch:2} + +>> Match( "1ˁa", "\p{Lm}" ) // Unicode 
Character “ˁ” (U+02C1), Modifier Letter Reversed Glottal Stop +{FullMatch:"ˁ",StartMatch:2} + +>> Match( "ˁa1", "\P{Lm}" ) // Unicode Character “ˁ” (U+02C1), Modifier Letter Reversed Glottal Stop +{FullMatch:"a",StartMatch:2} + +>> Match( "1ǂa1", "\p{Lo}" ) // Unicode Character “ǂ” (U+01C2), Latin Letter Alveolar Click +{FullMatch:"ǂ",StartMatch:2} + +>> Match( "ǂa1", "\P{Lo}" ) // Unicode Character “ǂ” (U+01C2), Latin Letter Alveolar Click +{FullMatch:"a",StartMatch:2} + +>> Match( "1҉a1", "\p{Me}" ) // Unicode Character “҉” (U+0489), Combining Cyrillic Millions Sign +{FullMatch:"҉",StartMatch:2} + +>> Match( "҉a1", "\P{Me}" ) // Unicode Character “҉” (U+0489), Combining Cyrillic Millions Sign +{FullMatch:"a",StartMatch:2} + +>> Match( "1◌̃a1", "\p{Mn}" ) // Unicode Character “◌̃” (U+0303), Combining Tilde +{FullMatch:"̃",StartMatch:3} + +>> Match( "◌̃a1", "\P{Mn}" ) // Unicode Character “◌̃” (U+0303), Combining Tilde +{FullMatch:"◌",StartMatch:1} + +>> Match( "1ைa1", "\p{Mc}" ) // Unicode Character “ை” (U+0BC8), Tamil Vowel Sign Ai +{FullMatch:"ை",StartMatch:2} + +>> Match( "ைa1", "\P{Mc}" ) // Unicode Character “ை” (U+0BC8), Tamil Vowel Sign Ai +{FullMatch:"a",StartMatch:2} + +// ErrInvalidRegExBadUnicodeCategory +>> Match( "1aBc2+", "\P{Cs}") +Errors: Error 17-25: Invalid regular expression: Invalid Unicode category name, found "\P{Cs}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "1aBc2+", "\P{Co}") +Errors: Error 17-25: Invalid regular expression: Invalid Unicode category name, found "\P{Co}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "1aBc2+", "\P{Cn}") +Errors: Error 17-25: Invalid regular expression: Invalid Unicode category name, found "\P{Cn}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "1aBc2+", "\P{Aa}") +Errors: Error 17-25: Invalid regular expression: Invalid Unicode category name, found "\P{Aa}".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "1aBc2+", "\P{Lz}") +Errors: Error 17-25: Invalid regular expression: Invalid Unicode category name, found "\P{Lz}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "1aBc2+", "\P{}") +Errors: Error 17-23: Invalid regular expression: Invalid escape code, found "\P".|Error 0-5: The function 'Match' has some invalid arguments. + +// +// CHARACTERS LIMITED IN CHARACTER CLASS +// + +>> Match( "abcd1234", "[\d]") +{FullMatch:"1",StartMatch:5} + +>> Match( "abcd1234", "[\w]") +{FullMatch:"a",StartMatch:1} + +// Single and double characters blocked + +>> Match( "abcd1234", "[{]") +Errors: Error 19-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "{".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[}]") +Errors: Error 19-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "}".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[/]") +Errors: Error 19-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "/".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[|]") +Errors: Error 19-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "|".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[\]") +Errors: Error 19-24: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "\".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "abcd1234", "[a^]") +Errors: Error 19-25: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "^".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[^^]") +Errors: Error 19-25: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "^".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[^^^]") +Errors: Error 19-26: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "^".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[-]") +Errors: Error 19-24: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[-]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[--]") +Errors: Error 19-25: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[--]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[-a]") +Errors: Error 19-25: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[-a]".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234", "[a-]") +Errors: Error 19-25: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[a-]".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "abcd1234", "[a-b]") +{FullMatch:"a",StartMatch:1} + +// Single is OK but double is not + +>> Match( "abcd1234***", "[*]") +{FullMatch:"*",StartMatch:9} + +// ErrInvalidRegExRepeatInCharClass +>> Match( "abcd1234***", "[**]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "**".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234+++", "[+]") +{FullMatch:"+",StartMatch:9} + +>> Match( "abcd1234+++", "[++]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "++".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234...", "[.]") +{FullMatch:".",StartMatch:9} + +>> Match( "abcd1234...", "[..]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "..".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234###", "[#]") +{FullMatch:"#",StartMatch:9} + +>> Match( "abcd1234###", "[##]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "##".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234$", "[$]") +{FullMatch:"$",StartMatch:9} + +>> Match( "abcd1234$$$", "[$$]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "$$".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234???", "[?]") +{FullMatch:"?",StartMatch:9} + +>> Match( "abcd1234???", "[??]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "??".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "abcd1234&&&", "[&]") +{FullMatch:"&",StartMatch:9} + +>> Match( "abcd1234&&&", "[&&]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "&&".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234!!!", "[!]") +{FullMatch:"!",StartMatch:9} + +>> Match( "abcd1234!!!", "[!!]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "!!".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234%%%", "[%]") +{FullMatch:"%",StartMatch:9} + +>> Match( "abcd1234%%%", "[%%]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "%%".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234,,,", "[,]") +{FullMatch:",",StartMatch:9} + +>> Match( "abcd1234,,,", "[,,]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated ",,".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234:::", "[:]") +{FullMatch:":",StartMatch:9} + +>> Match( "abcd1234:::", "[::]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "::".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234;;;", "[;]") +{FullMatch:";",StartMatch:9} + +>> Match( "abcd1234;;;", "[;;]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated ";;".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234<<<", "[<]") +{FullMatch:"<",StartMatch:9} + +>> Match( "abcd1234<<<", "[<<]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "<<".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "abcd1234===", "[=]") +{FullMatch:"=",StartMatch:9} + +>> Match( "abcd1234===", "[==]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated "==".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234>>>", "[>]") +{FullMatch:">",StartMatch:9} + +>> Match( "abcd1234>>>", "[>>]") +Errors: Error 22-28: Invalid regular expression: Character appears more than once in character class, found repeated ">>".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234@@@@", "[@]") +{FullMatch:"@",StartMatch:9} + +>> Match( "abcd1234@@@@", "[@@]") +Errors: Error 23-29: Invalid regular expression: Character appears more than once in character class, found repeated "@@".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234````", "[`]") +{FullMatch:"`",StartMatch:9} + +>> Match( "abcd1234````", "[``]") +Errors: Error 23-29: Invalid regular expression: Character appears more than once in character class, found repeated "``".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "abcd1234~~~~", "[~]") +{FullMatch:"~",StartMatch:9} + +>> Match( "abcd1234~~~~", "[~~]") +Errors: Error 23-29: Invalid regular expression: Character appears more than once in character class, found repeated "~~".|Error 0-5: The function 'Match' has some invalid arguments. + +// +// OTHER +// + +// Features supported by PCRE2 + +// ErrInvalidRegExBadParen +>> Match( "asdf", "(*MARK)") +Errors: Error 15-24: Invalid regular expression: Unsupported special group, found "(*M".|Error 0-5: The function 'Match' has some invalid arguments. + +// regular expression parsing + +>> Match( "test\123bed", "\\(\a)" ) +Errors: Error 22-30: Invalid regular expression: Invalid escape code, found "\a".|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "test\123bed", "\\(\d)" ) +{FullMatch:"\1",StartMatch:5} + +>> IsMatch( "abc", "[-a]" ) +Errors: Error 16-22: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[-a]".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "[a-]" ) +Errors: Error 16-22: Invalid regular expression: Literal hyphen in character class must be escaped with backslash, escape with "\-", found in "[a-]".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "[a-b]+c" ) +true + +// ErrInvalidRegExBadEscapeInsideCharacterClass +>> IsMatch( "abc", "(a)[\1]") +Errors: Error 16-25: Invalid regular expression: Escape character not permitted within character class, found "\1".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "(?<a>a)[\k<a>]") +Errors: Error 16-32: Invalid regular expression: Escape character not permitted within character class, found "\k".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "[\b]") +Errors: Error 16-22: Invalid regular expression: Escape character not permitted within character class, found "\b".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "[\B]") +Errors: Error 16-22: Invalid regular expression: Escape character not permitted within character class, found "\B".|Error 0-7: The function 'IsMatch' has some invalid arguments. + +>> IsMatch( "abc", "[(]") +Errors: Error 16-21: Invalid regular expression: Literal character needs to be escaped with a backslash when used in a character class, found at the end of "(".|Error 0-7: The function 'IsMatch' has some invalid arguments.
+ +>> IsMatch( "abc()", "[\(]", MatchOptions.Contains ) +true + +// Basic comments, more in Match_Comments.txt + +>> Match( "asdf", "(?# asdf )" ) +{FullMatch:"",StartMatch:1} + +>> Match( "asdf", "d(?# asdf )" ) +{FullMatch:"d",StartMatch:3} + +// ErrInvalidRegExUnclosedInlineComment +>> Match( "asdf", "(?# asdf " ) +Errors: Error 15-26: Invalid regular expression: Unclosed inline comment, starts with "(?#...".|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "(?x) f # asdf " ) +{FullMatch:"f",StartMatch:4} + +// Pattern and options must be constant values + +>> Match( "asdf", "asdf" ) +{FullMatch:"asdf",StartMatch:1} + +// Concat of constants is OK +>> Match( "asdf", "as" & "df" ) +{FullMatch:"asdf",StartMatch:1} + +>> Match( "asdf", Concatenate( "as", "df" ) ) +{FullMatch:"asdf",StartMatch:1} + +// Concat function is not +>> Match( "asdf", Concat(["as", "df"], Value) ) +Errors: Error 15-42: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. + +// Concat with Char and UniChar is OK +>> Match( "asdf", Char(97) & "sd" & Char(102) ) +{FullMatch:"asdf",StartMatch:1} + +>> Match( "asdf", Char(65+32) & "sd" & Char(70+32) ) +Errors: Error 34-35: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrVariableRegEx +>> Match( "asdf", Text( "asdf" ) ) +Errors: Error 15-29: Regular expression must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. + +// ErrVariableRegExOptions +>> Match( "asdf", "asdf", If( Int(4) > -1, MatchOptions.IgnoreCase, MatchOptions.Contains ) ) +Errors: Error 23-88: MatchOptions must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match( "asdf", "asdf", If( Int(4) > -1, MatchOptions.IgnoreCase, Blank() ) ) +Errors: Error 23-74: MatchOptions must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments.
+ +// ErrVariableRegEx +>> MatchAll( "asdf", Text( "asdf" ) ) +Errors: Error 18-32: Regular expression must be a constant value.|Error 0-8: The function 'MatchAll' has some invalid arguments. + +>> Match( "asDFASdf", "dfas", MatchOptions.IgnoreCase & MatchOptions.Contains ) +{FullMatch:"DFAS",StartMatch:3} + +>> Match( "asDFASdf", "dfas", Concatenate( MatchOptions.IgnoreCase, MatchOptions.Contains ) ) +{FullMatch:"DFAS",StartMatch:3} + +// ErrVariableRegExOptions +>> MatchAll( "asdf", "asdf", If( Int(4) > -1, MatchOptions.IgnoreCase, MatchOptions.Contains ) ) +Errors: Error 26-91: MatchOptions must be a constant value.|Error 0-8: The function 'MatchAll' has some invalid arguments. + +// ErrVariableRegEx +>> IsMatch( "asdf", Text( "asdf" ) ) +Errors: Error 17-31: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. + +// ErrVariableRegExOptions +>> IsMatch( "asdf", "asdf", If( Int(4) > -1, MatchOptions.IgnoreCase, MatchOptions.Contains ) ) +Errors: Error 25-90: MatchOptions must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Newlines.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Newlines.txt new file mode 100644 index 0000000000..d726d04df0 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Newlines.txt @@ -0,0 +1,313 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules,SupportColumnNamesAsIdentifiers + +// Newline and space character behavior in Power Fx regular expressions. 
+// +// Effective Usage .NET ECMAScript PCRE2 +// ===================================================================================================================================== +// (\r|\n|\r\n) ^ and $ in multiline mode No No Yes (as configured by Excel) +// (\z|\r\z|\n\z|\r\n\z) ^ and $ when not in multiline mode Yes No Yes (as configured by Excel) +// [^\n\r] . No No (close) Yes +// [ \r\n\f\v\t\x85\p{Z}] \s and \S Yes No (close) Yes + +// .NET will match up to the final newline, XRegExp and JavaScript do not + +>> AddColumns( Match( "test" & Char(10), "^test$" ), len, Len(FullMatch) ) +{FullMatch:"test",StartMatch:1,len:4} + +>> AddColumns( Match( "test" & Char(10) & Char(10), "^test$" ), len, Len(FullMatch) ) +Blank() + +>> AddColumns( Match( "test" & Char(13), "^test$" ), len, Len(FullMatch) ) +{FullMatch:"test",StartMatch:1,len:4} + +>> AddColumns( Match( "test" & Char(13) & Char(10), "^test$" ), len, Len(FullMatch) ) +{FullMatch:"test",StartMatch:1,len:4} + +// .NET treats dot as [^\n], XRegExp and JavaScript use [^\n\r\u2028\u2029] + +>> DropColumns( Match( "te" & " " & "t", "te.t" ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & Char(10) & "t", "te.t" ), FullMatch ) +Blank() + +>> DropColumns( Match( "te" & Char(13) & "t", "te.t" ), FullMatch ) +Blank() + +>> DropColumns( Match( "te" & UniChar(Hex2Dec("2028")) & "t", "te.t" ), FullMatch ) +{StartMatch:1} + +>> DropColumns( Match( "te" & UniChar(Hex2Dec("2029")) & "t", "te.t" ), FullMatch ) +{StartMatch:1} + +// $ end anchor, multiline, and newline characters + +>> MatchAll( "a1" & Char(10) & "b2" & Char(10) & "c3", "\d$" ) +Table({FullMatch:"3",StartMatch:8}) + +>> MatchAll( "a1" & Char(10) & "b2" & Char(10) & "c3", "\d$", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:2},{FullMatch:"2",StartMatch:5},{FullMatch:"3",StartMatch:8}) + +>> MatchAll( "a1" & Char(13) & "b2" & Char(10) & "c3", "\d$" ) +Table({FullMatch:"3",StartMatch:8}) + +>> MatchAll( "a1" & Char(13) & "b2" &
Char(13) & "c3", "\d$", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:2},{FullMatch:"2",StartMatch:5},{FullMatch:"3",StartMatch:8}) + +>> MatchAll( "a1" & Char(13)&Char(10) & "b2" & Char(13)&Char(10) & "c3", "\d$" ) +Table({FullMatch:"3",StartMatch:10}) + +>> MatchAll( "a1" & Char(13)&Char(10) & "b2" & Char(13)&Char(10) & "c3", "\d$", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:2},{FullMatch:"2",StartMatch:6},{FullMatch:"3",StartMatch:10}) + +// ^ beginning anchor, multiline, and newline characters + +>> MatchAll( "1a" & Char(10) & "2b" & Char(10) & "3c", "^\d" ) +Table({FullMatch:"1",StartMatch:1}) + +>> MatchAll( "1a" & Char(10) & "2b" & Char(10) & "3c", "^\d", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:1},{FullMatch:"2",StartMatch:4},{FullMatch:"3",StartMatch:7}) + +>> MatchAll( "1a" & Char(13) & "2b" & Char(10) & "3c", "^\d" ) +Table({FullMatch:"1",StartMatch:1}) + +>> MatchAll( "1a" & Char(13) & "2b" & Char(13) & "3c", "^\d", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:1},{FullMatch:"2",StartMatch:4},{FullMatch:"3",StartMatch:7}) + +>> MatchAll( "1a" & Char(13)&Char(10) & "2b" & Char(13)&Char(10) & "3c", "^\d" ) +Table({FullMatch:"1",StartMatch:1}) + +>> MatchAll( "1a" & Char(13)&Char(10) & "2b" & Char(13)&Char(10) & "3c", "^\d", MatchOptions.Multiline ) +Table({FullMatch:"1",StartMatch:1},{FullMatch:"2",StartMatch:5},{FullMatch:"3",StartMatch:9}) + +>> ForAll( MatchAll( " + a + b + c + ", "^.+$"), { Match: FullMatch, Len: Len(FullMatch), Start: StartMatch } ) +Table() + +>> ForAll( MatchAll( " + a + b + c + ", "^.+$", MatchOptions.Multiline), { Match: FullMatch, Len: Len(FullMatch), Start: StartMatch } ) +Table({Len:2,Match:" a",Start:3},{Len:2,Match:" b",Start:7},{Len:2,Match:" c",Start:11},{Len:1,Match:" ",Start:15}) + +>> MatchAll( "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "\w" ) 
+Table({FullMatch:"a",StartMatch:1},{FullMatch:"b",StartMatch:3},{FullMatch:"c",StartMatch:5},{FullMatch:"c",StartMatch:6}) + +>> MatchAll( "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "\w", MatchOptions.Multiline ) +Table({FullMatch:"a",StartMatch:1},{FullMatch:"b",StartMatch:3},{FullMatch:"c",StartMatch:5},{FullMatch:"c",StartMatch:6}) + +>> MatchAll( "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "\w", MatchOptions.Multiline & MatchOptions.Complete ) +Table({FullMatch:"a",StartMatch:1},{FullMatch:"b",StartMatch:3}) + +>> MatchAll( "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "\w", MatchOptions.Complete ) +Table() + +>> MatchAll( Char(13) & "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "^\w", MatchOptions.Multiline ) +Table({FullMatch:"a",StartMatch:2},{FullMatch:"b",StartMatch:4},{FullMatch:"c",StartMatch:6}) + +>> MatchAll( Char(10) & "a" & Char(10) & "b" & Char(10) & "cc" & Char(10), "^\w", MatchOptions.Multiline ) +Table({FullMatch:"a",StartMatch:2},{FullMatch:"b",StartMatch:4},{FullMatch:"c",StartMatch:6}) + +>> MatchAll( Char(13) & "a" & Char(13) & "b" & Char(13) & "cc" & Char(13), "^\w" ) +Table() + +>> MatchAll( Char(10) & "a" & Char(10) & "b" & Char(10) & "cc" & Char(10), "^\w" ) +Table() + +>> MatchAll( "1a3" & Char(13) & "2b4" & Char(13), "(?m)\d$" ) +Table({FullMatch:"3",StartMatch:3},{FullMatch:"4",StartMatch:7}) + +>> MatchAll( "1a3" & Char(13) & "2b4" & Char(13), "\d$" ) +Table({FullMatch:"4",StartMatch:7}) + +>> Match( "1a3" & Char(13), "\d$" ) +{FullMatch:"3",StartMatch:3} + +>> MatchAll( "1a3" & Char(10) & "2b4" & Char(10), "(?m)\d$" ) +Table({FullMatch:"3",StartMatch:3},{FullMatch:"4",StartMatch:7}) + +>> MatchAll( "1a3" & Char(10) & "2b4" & Char(10), "\d$" ) +Table({FullMatch:"4",StartMatch:7}) + +>> Match( "1a3" & Char(10), "\d$" ) +{FullMatch:"3",StartMatch:3} + +>> MatchAll( "1a3" & Char(13)&Char(10) & "2b4" & Char(13)&Char(10), "(?m)\d$" ) +Table({FullMatch:"3",StartMatch:3},{FullMatch:"4",StartMatch:8}) + +>> 
MatchAll( "1a3" & Char(13)&Char(10) & "2b4" & Char(13)&Char(10), "\d$" ) +Table({FullMatch:"4",StartMatch:8}) + +>> Match( "1a3" & Char(13)&Char(10), "\d$" ) +{FullMatch:"3",StartMatch:3} + +// spaces + +>> IsMatch( "h" & "a" & "d", "h\sd") // control +false + +>> IsMatch( "h" & UniChar(Hex2Dec("0020")) & "d", "h\sd") // " " +true + +>> IsMatch( "h" & UniChar(Hex2Dec("000d")) & "d", "h\sd") // \r +true + +>> IsMatch( "h" & UniChar(Hex2Dec("000c")) & "d", "h\sd") // \f +true + +>> IsMatch( "h" & UniChar(Hex2Dec("000a")) & "d", "h\sd") // \n +true + +>> IsMatch( "h" & UniChar(Hex2Dec("0009")) & "d", "h\sd") // \t +true + +>> IsMatch( "h" & UniChar(Hex2Dec("000b")) & "d", "h\sd") // \v +true + +>> IsMatch( "h" & UniChar(Hex2Dec("0085")) & "d", "h\sd") // \x85, not in ECMAScript +true + +>> IsMatch( "h" & UniChar(Hex2Dec("1680")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2000")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2001")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2002")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2003")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2004")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2005")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2006")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2007")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2008")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2009")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("200a")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("202f")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("205f")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("3000")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2028")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("2029")) & "d", "h\sd") +true + +>> IsMatch( "h" & UniChar(Hex2Dec("feff")) & "d", "h\sd") // 
ECMAScript +false + +>> IsMatch( "h" & UniChar(Hex2Dec("00a0")) & "d", "h\sd") +true + +// non-space + +>> IsMatch( "h" & "a" & "d", "h\Sd") // control +true + +>> IsMatch( "h" & UniChar(Hex2Dec("0020")) & "d", "h\Sd") // " " +false + +>> IsMatch( "h" & UniChar(Hex2Dec("000d")) & "d", "h\Sd") // \r +false + +>> IsMatch( "h" & UniChar(Hex2Dec("000c")) & "d", "h\Sd") // \f +false + +>> IsMatch( "h" & UniChar(Hex2Dec("000a")) & "d", "h\Sd") // \n +false + +>> IsMatch( "h" & UniChar(Hex2Dec("0009")) & "d", "h\Sd") // \t +false + +>> IsMatch( "h" & UniChar(Hex2Dec("000b")) & "d", "h\Sd") // \v +false + +>> IsMatch( "h" & UniChar(Hex2Dec("0085")) & "d", "h\Sd") // \x85, not in ECMAScript +false + +>> IsMatch( "h" & UniChar(Hex2Dec("1680")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2000")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2001")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2002")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2003")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2004")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2005")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2006")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2007")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2008")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2009")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("200a")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("202f")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("205f")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("3000")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2028")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("2029")) & "d", "h\Sd") +false + +>> IsMatch( "h" & UniChar(Hex2Dec("feff")) & "d", "h\Sd") // ECMAScript +true + +>> IsMatch( "h" & UniChar(Hex2Dec("00a0")) & "d", "h\Sd") 
+false + diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_StronglyTypedEnumsDisabled.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_StronglyTypedEnumsDisabled.txt index ab1e431f60..91cff38391 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_StronglyTypedEnumsDisabled.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_StronglyTypedEnumsDisabled.txt @@ -1,4 +1,4 @@ #SETUP: RegEx,disable:StronglyTypedBuiltinEnums ->> Match("Hello", "Hello", "") -{FullMatch:"Hello",StartMatch:1,SubMatches:Table()} +>> Match("Hello", "Hello", "").FullMatch +"Hello" diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Unicode_Net462.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Unicode_Net462.txt new file mode 100644 index 0000000000..32fc5ff248 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_Unicode_Net462.txt @@ -0,0 +1,20 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules,SupportColumnNamesAsIdentifiers + +// Changes in case insensitive matching in .NET 7 causes different answers that are consistent with PCRE2 and Node +// See https://devblogs.microsoft.com/dotnet/regular-expression-improvements-in-dotnet-7/#case-insensitive-matching-and-regexoptions-ignorecase + +#DISABLE.NET:70 +>> Match( UniChar(Hex2Dec("03a9")), "\u2126", MatchOptions.IgnoreCase ).FullMatch +Blank() + +#DISABLE.NET:70 +>> Match( UniChar(Hex2Dec("03c9")), "\u2126", MatchOptions.IgnoreCase ).FullMatch +Blank() + +#DISABLE.NET:70 +>> Match( UniChar(Hex2Dec("2126")), "\u03c9", MatchOptions.IgnoreCase ).FullMatch +Blank() + +#DISABLE.NET:70 +>> Match( UniChar(Hex2Dec("2126")), "\u03a9", MatchOptions.IgnoreCase ).FullMatch +Blank() diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1Compat.txt 
b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1Compat.txt new file mode 100644 index 0000000000..23bbc11d5d --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1Compat.txt @@ -0,0 +1,86 @@ +#SETUP: RegEx,PowerFxV1CompatibilityRules + +>> Match("Hello", "\w") +{FullMatch:"H",StartMatch:1} + +>> Index(Match("Hello", "(\w)l(\w)", MatchOptions.NumberedSubMatches).SubMatches, 1).Value +"e" + +>> Index(Match("Hello", "(\w)l(\w)", MatchOptions.NumberedSubMatches).SubMatches, 2).Value +"l" + +>> Concat(ForAll(Match("Hello", "(\w)l(\w)", MatchOptions.NumberedSubMatches).SubMatches, With({x:Value}, x)), Value, ", ") +"e, l" + +>> Match("Hello", "(\w)l(\w)", MatchOptions.NumberedSubMatches).SubMatches +Table({Value:"e"},{Value:"l"}) + +>> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete) +Blank() + +>> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline & MatchOptions.NumberedSubMatches) +{FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})} + +>> Match("JohnDoe@microsoft.com", Match.Email) +{FullMatch:"JohnDoe@microsoft.com",StartMatch:1} + +>> Match("(555) 123-4567", "^[\+]?[\(]?[0-9]{3}[\)]?[\-\s\.]?[0-9]{3}[\-\s\.]?[0-9]{4,6}$") +{FullMatch:"(555) 123-4567",StartMatch:1} + +>> Match("Hello", "Hello", MatchOptions.IgnoreCase) +{FullMatch:"Hello",StartMatch:1} + +>> Match("Hi", "Hi", MatchOptions.Multiline) +{FullMatch:"Hi",StartMatch:1} + +>> Match("28", "28", Blank()) +Errors: Error 18-25: MatchOptions must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. + +>> Match("28", "28", 28) +Errors: Error 18-20: MatchOptions must be a constant value.|Error 0-5: The function 'Match' has some invalid arguments. 
+ +>> Match( "a", "((a)|(b))", MatchOptions.NumberedSubMatches ) +{FullMatch:"a",StartMatch:1,SubMatches:Table({Value:"a"},{Value:"a"},{Value:Blank()})} + +>> Match( "b", "((a)|(b))", MatchOptions.NumberedSubMatches ) +{FullMatch:"b",StartMatch:1,SubMatches:Table({Value:"b"},{Value:Blank()},{Value:"b"})} + +>> Match( "c", "((a)|(b))", MatchOptions.NumberedSubMatches ) +Blank() + +>> Match( "a", "(?(?a)|(?b))" ) +{FullMatch:"a",StartMatch:1,alpha:"a",beta:Blank(),whole:"a"} + +>> Match( "b", "(?(?a)|(?b))" ) +{FullMatch:"b",StartMatch:1,alpha:Blank(),beta:"b",whole:"b"} + +>> Match( "c", "(?(?a)|(?b))" ) +Blank() + +// Match( "a", "(b*)*", MatchOptions.NumberedSubMatches ) is ambiguous, with different results for .NET and JavaScript + +>> Match( "a", "(b)*", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:Blank()})} + +>> Match( "a", "(b*)", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:""})} + +>> Match( "a", "(b)?", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:Blank()})} + +>> Match( "a", "(b?)", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:""})} + +>> Match( "a", "(b){0,4}", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:Blank()})} + +>> Match( "a", "(b{0,4})", MatchOptions.NumberedSubMatches ) +{FullMatch:"",StartMatch:1,SubMatches:Table({Value:""})} + +// Match( "ab", "(a)*b\1" , MatchOptions.NumberedSubMatches ) is ambiguous, with different results for .NET and JavaScript + +>> Match( "ab", "(a)*b" , MatchOptions.NumberedSubMatches ) +{FullMatch:"ab",StartMatch:1,SubMatches:Table({Value:"a"})} + +>> Match( "ab", "(a*)b\1" , MatchOptions.NumberedSubMatches ) +{FullMatch:"b",StartMatch:2,SubMatches:Table({Value:""})} diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1CompatDisabled.txt 
b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1CompatDisabled.txt new file mode 100644 index 0000000000..0228c842ff --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/Match_V1CompatDisabled.txt @@ -0,0 +1,38 @@ +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +>> Match("Hello", "\w") +{FullMatch:"H",StartMatch:1,SubMatches:Table()} + +>> Match("Hello", "llo", MatchOptions.Complete).SubMatches +Blank() + +>> Concat(ForAll(Match( "Bob Jones ", "<(?" & Match.Email & ")>").SubMatches, With({x:Value}, x)), Value, ", ") +"bob.jones@contoso.com" + +>> Index(Match("Hello", "(?\w)l(?\w)").SubMatches, 1).Value +"e" + +>> Index(Match("Hello", "(?\w)l(?\w)").SubMatches, 2).Value +"l" + +>> Concat(ForAll(Match("Hello", "(?\w)l(?\w)").SubMatches, With({x:Value}, x)), Value, ", ") +"e, l" + +>> Match("Hello", "(?\w)l(?\w)").SubMatches +Table({Value:"e"},{Value:"l"}) + +>> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) +{FullMatch:"Joe 164",StartMatch:1,SubMatches:Table({Value:"Joe"},{Value:"164"})} + +>> Match("JohnDoe@microsoft.com", Match.Email) +{FullMatch:"JohnDoe@microsoft.com",StartMatch:1,SubMatches:Table()} + +>> Match("(555) 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +{FullMatch:"(555) 123-4567",StartMatch:1,SubMatches:Table()} + +>> Match("Hello", "Hello", MatchOptions.IgnoreCase) +{FullMatch:"Hello",StartMatch:1,SubMatches:Table()} + +>> Match("Hi", "Hi", MatchOptions.Multiline) +{FullMatch:"Hi",StartMatch:1,SubMatches:Table()} + diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums.txt index 6b0b9cc17d..f9924ec662 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums.txt +++ 
b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums.txt @@ -49,7 +49,7 @@ Table({Value:1},{Value:2},{Value:3}) >> Match( "info@contoso.com", Match.Email ) -{FullMatch:"info@contoso.com",StartMatch:1,SubMatches:Table()} +{FullMatch:"info@contoso.com",StartMatch:1} >> JSON( [1,2,3], JSONFormat.FlattenValueTables ) "[1,2,3]" @@ -418,10 +418,10 @@ Errors: Error 0-7: The function 'Boolean' has some invalid arguments. // Match (and friends) allows coercion from the backing kind for the regular expression >> Match( "a3d4", Match.Digit ) -{FullMatch:"3",StartMatch:2,SubMatches:Table()} +{FullMatch:"3",StartMatch:2} >> Match( "a3d4", "\d") -{FullMatch:"3",StartMatch:2,SubMatches:Table()} +{FullMatch:"3",StartMatch:2} >> IsMatch( "a3d4", Match.Digit ) false @@ -430,10 +430,10 @@ false false >> MatchAll( "a3d4", Match.Digit ) -Table({FullMatch:"3",StartMatch:2,SubMatches:Table()},{FullMatch:"4",StartMatch:4,SubMatches:Table()}) +Table({FullMatch:"3",StartMatch:2},{FullMatch:"4",StartMatch:4}) >> MatchAll( "a3d4", "\d") -Table({FullMatch:"3",StartMatch:2,SubMatches:Table()},{FullMatch:"4",StartMatch:4,SubMatches:Table()}) +Table({FullMatch:"3",StartMatch:2},{FullMatch:"4",StartMatch:4}) >> Match.Digit = "\d" true @@ -531,7 +531,7 @@ Errors: Error 34-35: Invalid argument type (Text). Expecting a Enum (JSONFormat) // Since the Match supports CanCoerceBackingKind, any concatenation combination is supported >> Match( "334", Match.Digit & Match.Digit & JSONFormat.IndentFour ) -{FullMatch:"334",StartMatch:1,SubMatches:Table()} +{FullMatch:"334",StartMatch:1} // Concatenation can be allowed between members of the option set and still retain strong typing with IExternalOptionSet.CanConcatenateStronglyTyped @@ -553,16 +553,16 @@ Errors: Error 15-18: Invalid argument type (Text). 
Expecting a Enum (JSONFormat) // Concatenation can be allowed with text strings and still retain strong typing with IExternalOptionSet.CanCoerceBackingKind >> Match( "33this is ok", Concatenate( Match.Digit, Match.Digit, "this is ok" ) ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +{FullMatch:"33this is ok",StartMatch:1} >> Match( "33this is ok", Match.Digit & Match.Digit & "this is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +{FullMatch:"33this is ok",StartMatch:1} >> Match( "33this is ok", "\d\dthis is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +{FullMatch:"33this is ok",StartMatch:1} >> Match( "33this is ok", "\d" & "\d" & "this is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +{FullMatch:"33this is ok",StartMatch:1} // Strongly typed concatenated results can be compared @@ -617,16 +617,16 @@ true // >> Match( "a34d", Match.Digit & "\d" ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +{FullMatch:"34",StartMatch:2} >> Match( "a34d", "\d" & Match.Digit ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +{FullMatch:"34",StartMatch:2} >> Match( "a34d", Concatenate( Match.Digit, "\d" ) ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +{FullMatch:"34",StartMatch:2} >> Match( "a34d", Concatenate( "\d", Match.Digit ) ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +{FullMatch:"34",StartMatch:2} >> Text(Match.Digit) & Text(SortOrder.Ascending) "\dascending" diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums_PreV1.txt b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums_PreV1.txt index 6413bf7f79..64951c6d06 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums_PreV1.txt +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestCases/StronglyTypedEnum_BuiltInEnums_PreV1.txt @@ -50,8 +50,8 @@ >> Sort( [1,2,3], 
SortOrder.Descending ) Table({Value:1},{Value:2},{Value:3}) ->> Match( "info@contoso.com", Match.Email ) -{FullMatch:"info@contoso.com",StartMatch:1,SubMatches:Table()} +>> Match( "info@contoso.com", Match.Email ).FullMatch +"info@contoso.com" >> JSON( [1,2,3], JSONFormat.FlattenValueTables ) "[1,2,3]" @@ -88,8 +88,8 @@ Blank() >> Match( "hi", JSONFormat.IndentFour ) Blank() ->> Match( "hi", "h.*", JSONFormat.IndentFour ) -{FullMatch:"hi",StartMatch:1,SubMatches:Table()} +>> Match( "hi", "h.*", JSONFormat.IndentFour ).FullMatch +"hi" >> Match( "hi", Match.Email, JSONFormat.IndentFour ) Blank() @@ -106,8 +106,8 @@ false >> MatchAll( "hi", JSONFormat.IndentFour ) Table() ->> MatchAll( "hi", "h.*", JSONFormat.IndentFour ) -Table({FullMatch:"hi",StartMatch:1,SubMatches:Table()}) +>> ForAll( MatchAll( "hiha", "h", JSONFormat.IndentFour ), {fm:FullMatch, sm:StartMatch} ) +Table({fm:"h",sm:1},{fm:"h",sm:3}) >> MatchAll( "hi", Match.Email, JSONFormat.IndentFour ) Table() @@ -169,14 +169,14 @@ Table({Value:3},{Value:2},{Value:1}) >> JSON( [1,2,3], "_" ) "[1,2,3]" ->> Match( "howdy", "h", "c" ) -{FullMatch:"h",StartMatch:1,SubMatches:Table()} +>> Match( "howdy", "h", "c" ).FullMatch +"h" >> IsMatch("Foo", 17) -Errors: Error 15-17: Regular expressions must be constant values. +Errors: Error 15-17: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. >> IsMatch("Foo", 1/0) -Errors: Error 16-17: Regular expressions must be constant values. +Errors: Error 16-17: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. 
//=========================================================================================================== // @@ -374,11 +374,11 @@ Error({Kind:ErrorKind.InvalidArgument}) // Match (and friends) allows coercion from the backing kind for the regular expression ->> Match( "a3d4", Match.Digit ) -{FullMatch:"3",StartMatch:2,SubMatches:Table()} +>> Match( "a3d4", Match.Digit ).FullMatch +"3" ->> Match( "a3d4", "\d") -{FullMatch:"3",StartMatch:2,SubMatches:Table()} +>> Match( "a3d4", "\d").FullMatch +"3" >> IsMatch( "a3d4", Match.Digit ) false @@ -386,11 +386,11 @@ false >> IsMatch( "a3d4", "\d") false ->> MatchAll( "a3d4", Match.Digit ) -Table({FullMatch:"3",StartMatch:2,SubMatches:Table()},{FullMatch:"4",StartMatch:4,SubMatches:Table()}) +>> ForAll( MatchAll( "a3d4", Match.Digit ), {fm:FullMatch} ) +Table({fm:"3"},{fm:"4"}) ->> MatchAll( "a3d4", "\d") -Table({FullMatch:"3",StartMatch:2,SubMatches:Table()},{FullMatch:"4",StartMatch:4,SubMatches:Table()}) +>> ForAll( MatchAll( "a3d4", "\d"), {fm:FullMatch} ) +Table({fm:"3"},{fm:"4"}) >> Match.Digit = "\d" true @@ -484,8 +484,8 @@ Table({Value:1},{Value:2},{Value:3}) "[{""Value"":1},{""Value"":2},{""Value"":3}]" // Since the Match supports CanCoerceBackingKind, any concatenation combination is supported ->> Match( "334", Match.Digit & Match.Digit & JSONFormat.IndentFour ) -{FullMatch:"334",StartMatch:1,SubMatches:Table()} +>> Match( "334", Match.Digit & Match.Digit & JSONFormat.IndentFour ).FullMatch +"334" // Concatenation can be allowed between members of hte option set and still retain strong typing with IExternalOptionSet.CanConcatenateStronglyTyped @@ -506,17 +506,17 @@ Table({Value:1},{Value:2},{Value:3}) // Concatenation can be allowed with text strings and still retain strong typing with IExternalOptionSet.CanCoerceBackingKind ->> Match( "33this is ok", Concatenate( Match.Digit, Match.Digit, "this is ok" ) ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +>> Match( "33this is ok", Concatenate( 
Match.Digit, Match.Digit, "this is ok" ) ).FullMatch +"33this is ok" ->> Match( "33this is ok", Match.Digit & Match.Digit & "this is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +>> Match( "33this is ok", Match.Digit & Match.Digit & "this is ok" ).FullMatch +"33this is ok" ->> Match( "33this is ok", "\d\dthis is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +>> Match( "33this is ok", "\d\dthis is ok" ).FullMatch +"33this is ok" ->> Match( "33this is ok", "\d" & "\d" & "this is ok" ) -{FullMatch:"33this is ok",StartMatch:1,SubMatches:Table()} +>> Match( "33this is ok", "\d" & "\d" & "this is ok" ).FullMatch +"33this is ok" // Strongly typed concatenated results can be compared @@ -572,17 +572,17 @@ true // 14. CanConcatenateStronglyTyped & CanCoerceFromBackingKind - An important combination, used by Match, allows strings and enums to be mixed // ->> Match( "a34d", Match.Digit & "\d" ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +>> Match( "a34d", Match.Digit & "\d" ).FullMatch +"34" ->> Match( "a34d", "\d" & Match.Digit ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +>> Match( "a34d", "\d" & Match.Digit ).FullMatch +"34" ->> Match( "a34d", Concatenate( Match.Digit, "\d" ) ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +>> Match( "a34d", Concatenate( Match.Digit, "\d" ) ).FullMatch +"34" ->> Match( "a34d", Concatenate( "\d", Match.Digit ) ) -{FullMatch:"34",StartMatch:2,SubMatches:Table()} +>> Match( "a34d", Concatenate( "\d", Match.Digit ) ).FullMatch +"34" >> Text(Match.Digit) & Text(SortOrder.Ascending) "\dascending" @@ -1451,4 +1451,4 @@ true false >> IsMatch( "28", 28 ) -Errors: Error 15-17: Regular expressions must be constant values. +Errors: Error 15-17: Regular expression must be a constant value.|Error 0-7: The function 'IsMatch' has some invalid arguments. 
diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/BaseRunner.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/BaseRunner.cs index 17f8c60519..79937af2a7 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/BaseRunner.cs +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/BaseRunner.cs @@ -213,7 +213,8 @@ public abstract class BaseRunner var expectedCompilerError = expected.StartsWith("Errors: Error") || expected.StartsWith("Errors: Warning"); // $$$ Match error message. if (expectedCompilerError) { - string[] expectedStrArr = expected.Replace("Errors: ", string.Empty).Split('|'); + // regex used to support error messages that contain a | + string[] expectedStrArr = Regex.Matches(expected.Replace("Errors: ", string.Empty), "(\".*\"|[^\\|])+").Select(exp => exp.ToString()).ToArray(); string[] actualStrArr = runResult.Errors.Select(err => err.ToString()).ToArray(); bool isValid = true; diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TestRunner.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TestRunner.cs index ef33b4c3be..8685d709ed 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TestRunner.cs +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TestRunner.cs @@ -102,6 +102,8 @@ public static Dictionary ParseSetupString(string setup) possible.Add("RegEx"); possible.Add("TimeZoneInfo"); possible.Add("TraceSetup"); + possible.Add("CultureInfo"); + possible.Add("Net7"); foreach (Match match in Regex.Matches(setup, @"(disable:)?(([\w]+|//)(\([^\)]*\))?)")) { @@ -134,25 +136,25 @@ public static Dictionary ParseSetupString(string setup) return settings; } - public void AddDir(Dictionary setup, string directory = "") + public void AddDir(Dictionary setup, Dictionary requiredSetup, string directory = "") { directory = GetFullPath(directory, TestRoot); var 
allFiles = Directory.EnumerateFiles(directory); - AddFile(setup, allFiles); + AddFile(setup, requiredSetup, allFiles); } - public void AddFile(Dictionary setup, params string[] files) + public void AddFile(Dictionary setup, Dictionary requiredSetup, params string[] files) { var x = (IEnumerable)files; - AddFile(setup, x); + AddFile(setup, requiredSetup, x); } - public void AddFile(Dictionary setup, IEnumerable files) + public void AddFile(Dictionary setup, Dictionary requiredSetup, IEnumerable files) { foreach (var file in files) { - AddFile(setup, file); + AddFile(setup, requiredSetup, file); } } @@ -179,7 +181,7 @@ private static bool TryParseDirective(string line, string directive, out string } } - public void AddFile(Dictionary setup, string thisFile) + public void AddFile(Dictionary setup, Dictionary requiredSetup, string thisFile) { thisFile = GetFullPath(thisFile, TestRoot); @@ -264,7 +266,19 @@ public void AddFile(Dictionary setup, string thisFile) { return; } - } + } + + // If requiredSetup is supplied, then those setup elements must be present and agree + if (requiredSetup != null) + { + foreach (var flag in requiredSetup) + { + if (!fileSetupDict.ContainsKey(flag.Key) || flag.Value != fileSetupDict[flag.Key]) + { + return; + } + } + } fileSetup = string.Join(",", fileSetupDict.Select(i => (i.Value ? 
string.Empty : "disable:") + i.Key)); diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TxtFileDataAttribute.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TxtFileDataAttribute.cs index 7dd826dcbb..6d0a91e12a 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TxtFileDataAttribute.cs +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/ExpressionTestHelpers/TxtFileDataAttribute.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Reflection; using Xunit.Sdk; @@ -21,13 +22,18 @@ public class TxtFileDataAttribute : DataAttribute private readonly string _filePathSpecific; private readonly string _engineName; private readonly Dictionary _setup; + private readonly Dictionary _requiredSetup; - public TxtFileDataAttribute(string filePathCommon, string filePathSpecific, string engineName, string setup) + public TxtFileDataAttribute(string filePathCommon, string filePathSpecific, string engineName, string setup, string requiredSetup = null) { _filePathCommon = filePathCommon; _filePathSpecific = filePathSpecific; _engineName = engineName; _setup = TestRunner.ParseSetupString(setup); + if (requiredSetup != null) + { + _requiredSetup = TestRunner.ParseSetupString(requiredSetup); + } } public override IEnumerable GetData(MethodInfo testMethod) @@ -58,7 +64,7 @@ public override IEnumerable GetData(MethodInfo testMethod) if (file.EndsWith(".txt", StringComparison.InvariantCultureIgnoreCase)) { - parser.AddFile(_setup, file); + parser.AddFile(_setup, _requiredSetup, file); } } } diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/Helpers/RegEx_JavaScript.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/Helpers/RegEx_JavaScript.cs new file mode 100644 index 0000000000..3e452f72a9 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/Helpers/RegEx_JavaScript.cs @@ -0,0 +1,238 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT license. + +// This file contains the source for AlterRegex in JavaScript. +// It is included here so that the test suite can compare results from .NET, JavaScript, and PCRE2. +// It is here in the Core library so that it can be extracted in the Canvas build and compared against the version stored there. + +namespace Microsoft.PowerFx.Functions +{ + public class RegEx_JavaScript + { + // This JavaScript function assumes that the regular expression has already been compiled and conforms to the Power Fx regular expression language. + // For example, no allowance is made for nested character classes or inline options on a subexpression, as those would have already been blocked. + // Stick to single ticks for strings to keep this easier to read and maintain here in C#. + public const string AlterRegex_JavaScript = @" + function AlterRegex_JavaScript(regex, flags) + { + var index = 0; + + const inlineFlagsRE = /^\(\?(?[imnsx]+)\)/; + const inlineFlags = inlineFlagsRE.exec( regex ); + if (inlineFlags != null) + { + flags = flags.concat(inlineFlags.groups['flags']); + index = inlineFlags[0].length; + } + + const freeSpacing = flags.includes('x'); + const multiline = flags.includes('m'); + const dotAll = flags.includes('s'); + const ignoreCase = flags.includes('i'); + const numberedSubMatches = flags.includes('N'); + + // rebuilding from booleans avoids possible duplicate letters + // x has been handled in this function and does not need to be passed on (and would cause an error) + const alteredFlags = 'v'.concat((ignoreCase ? 'i' : ''), (multiline ? 'm' : ''), (dotAll ? 's' : '')); + + var openCharacterClass = false; // are we defining a character class? 
+ var altered = ''; + var spaceWaiting = false; + var mainCharacterClass = ''; + var orCharacterClass = ''; + + for ( ; index < regex.length; index++) + { + var alteredToken = ''; + + switch (regex.charAt(index) ) + { + case '[': + openCharacterClass = true; + mainCharacterClass = ''; + orCharacterClass = ''; + spaceWaiting = false; + break; + + case ']': + openCharacterClass = false; + if (mainCharacterClass != '' && orCharacterClass != '') + altered = altered.concat('(?:[', mainCharacterClass, ']', orCharacterClass, ')'); + else if(mainCharacterClass != '') + altered = altered.concat('[', mainCharacterClass, ']'); + else + altered = altered.concat(orCharacterClass.substring(1)); // strip leading '|' deliniator + spaceWaiting = false; + break; + + case '\\': + if (++index < regex.length) + { + const wordChar = '\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Lm}\\p{Mn}\\p{Nd}\\p{Pc}'; + const spaceChar = '\\f\\n\\r\\t\\v\\x85\\p{Z}'; + const digitChar = '\\p{Nd}'; + + switch (regex.charAt(index)) + { + case 'w': + alteredToken = ''.concat(openCharacterClass ? '' : '[', wordChar, openCharacterClass ? '' : ']'); + break; + case 'W': + if (openCharacterClass) + orCharacterClass = orCharacterClass.concat( '|[^', wordChar, ']' ); + else + alteredToken = ''.concat('[^', wordChar, ']'); + break; + + case 'b': + alteredToken = `(?:(?<=[${wordChar}])(?![${wordChar}])|(? 2 && regex.charAt(index+1) == '?' 
&& regex.charAt(index+2) == '#') + { + // inline comment + for ( index++; index < regex.length && regex.charAt(index) != ')'; index++) + { + // eat characters until a close paren, it doesn't matter if it is escaped (consistent with .NET) + } + + spaceWaiting = true; + } + else + { + alteredToken = '('; + spaceWaiting = false; + } + + break; + + case ' ': case '\f': case '\n': case '\r': case '\t': + if (freeSpacing && !openCharacterClass) + { + spaceWaiting = true; + } + else + { + alteredToken = regex.charAt(index); + spaceWaiting = false; + } + + break; + + case '#': + if (freeSpacing && !openCharacterClass) + { + for ( index++; index < regex.length && regex.charAt(index) != '\r' && regex.charAt(index) != '\n'; index++) + { + // eat characters until the end of the line + // leaving dangling whitespace characters will be eaten on next iteration + } + + spaceWaiting = true; + } + else + { + alteredToken = '#'; + spaceWaiting = false; + } + + break; + + case '*': case '+': case '?': case '{': + if (spaceWaiting && altered.length > 0 && altered.charAt(altered.length-1) == '(') + { + alteredToken = '(?:)'; + spaceWaiting = false; + } + alteredToken = alteredToken.concat(regex.charAt(index)); + spaceWaiting = false; + break; + + default: + if (spaceWaiting) + { + alteredToken = '(?:)'; + spaceWaiting = false; + } + alteredToken = alteredToken.concat(regex.charAt(index)); + break; + } + + if (openCharacterClass) + mainCharacterClass = mainCharacterClass.concat(alteredToken); + else + altered = altered.concat(alteredToken); + } + + if (flags.includes('^')) + { + altered = '^' + altered; + } + + if (flags.includes('$')) + { + altered = altered + '$'; + } + + return [altered, alteredFlags]; + } + "; + } +} diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/InternalSetup.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/InternalSetup.cs index 7301c0178f..224fb7f354 100644 --- 
a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/InternalSetup.cs +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/InternalSetup.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Globalization; using System.Linq; using System.Text.RegularExpressions; using Microsoft.PowerFx.Core.Parser; @@ -19,6 +20,8 @@ internal class InternalSetup internal TimeZoneInfo TimeZoneInfo { get; set; } + internal CultureInfo CultureInfo { get; set; } + /// /// By default, we run expressions with a memory governor to enforce a limited amount of memory. /// When true, disable memory checks and allow expression to use as much memory as it needs. @@ -107,6 +110,11 @@ internal static InternalSetup Parse(string setupHandlerName, Features features, if (string.Equals(part, "DisableMemChecks", StringComparison.OrdinalIgnoreCase)) { + if (isDisable) + { + throw new ArgumentException("Invalid DisableMemChecks setup!"); + } + iSetup.DisableMemoryChecks = true; parts.Remove(part); } @@ -129,6 +137,11 @@ internal static InternalSetup Parse(string setupHandlerName, Features features, } else if (part.StartsWith("TimeZoneInfo", StringComparison.OrdinalIgnoreCase)) { + if (isDisable) + { + throw new ArgumentException("Invalid TimeZoneInfo setup!"); + } + var m = new Regex(@"TimeZoneInfo\(""(?[^)]+)""\)", RegexOptions.IgnoreCase).Match(part); if (m.Success) @@ -144,6 +157,28 @@ internal static InternalSetup Parse(string setupHandlerName, Features features, throw new ArgumentException("Invalid TimeZoneInfo setup!"); } } + else if (part.StartsWith("CultureInfo", StringComparison.OrdinalIgnoreCase)) + { + if (isDisable) + { + throw new ArgumentException("Invalid CultureInfo setup!"); + } + + var m = new Regex(@"CultureInfo\(""(?[^)]+)""\)", RegexOptions.IgnoreCase).Match(part); + + if (m.Success) + { + var culture = m.Groups["culture"].Value; + + // This call will throw if the Language tag in invalid + iSetup.CultureInfo = new 
CultureInfo(culture); + parts.Remove(part); + } + else + { + throw new ArgumentException("Invalid CultureInfo setup!"); + } + } } iSetup.HandlerNames = parts; diff --git a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/TestRunnerTests.cs b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/TestRunnerTests.cs index a929452ea6..f7d6a15805 100644 --- a/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/TestRunnerTests.cs +++ b/src/tests/Microsoft.PowerFx.Core.Tests.Shared/TestRunnerTests/TestRunnerTests.cs @@ -533,7 +533,7 @@ public void TestErrorOverride3() private static void AddFile(TestRunner runner, string filename) { var test1 = GetFullPath(filename, TxtFileDataAttribute.GetDefaultTestDir("TestRunnerTests")); - runner.AddFile(TestRunner.ParseSetupString(string.Empty), test1); + runner.AddFile(TestRunner.ParseSetupString(string.Empty), null, test1); } } } diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/FileExpressionEvaluationTests.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/FileExpressionEvaluationTests.cs index ed3f5348ca..0f1bea5408 100644 --- a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/FileExpressionEvaluationTests.cs +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/FileExpressionEvaluationTests.cs @@ -93,6 +93,29 @@ public void None_Float(ExpressionTestCase testCase) } #endif +#if MATCHCOMPARE + // to enable, place this in Solution Items/Directory.Build.props: + // <PropertyGroup> + // <DefineConstants>$(DefineConstants);MATCHCOMPARE</DefineConstants> + // </PropertyGroup> + +#if true // may not want to run this, even if MATCHCOMPARE is enabled + + // Runs only tests that have asked for RegEx setup. This test run will compare the regular expression results between + // .NET (used in the C# interpreter), NodeJS with JavaScript (used in Canvas), and PCRE2 (used in Excel). + // This is not run all the time. It requires Node to be installed and PCRE2 built as a shared library DLL and on the path. 
+ [TxtFileData("ExpressionTestCases", "InterpreterExpressionTestCases", nameof(InterpreterRunner), "PowerFxV1,disable:NumberIsFloat,DecimalSupport", "RegEx")] + [InterpreterTheory] + public void RegExCompare(ExpressionTestCase t) + { + ExpressionEvaluationTests.RegExCompareNode = true; + ExpressionEvaluationTests.RegExComparePCRE2 = true; + + RunExpressionTestCase(t, Features.PowerFxV1, numberIsFloat: false, Console); + } +#endif +#endif + private static string _currentNetVersion = null; private static readonly object _cnvLock = new object(); @@ -156,34 +179,45 @@ private static bool ShouldSkipDotNetVersion(ExpressionTestCase testCase, string } #if false - // Helper to run a single .txt + // Helper to run a single .txt [Fact] public void RunOne() { - var path = @"D:\repos\osp1\src\tests\Microsoft.PowerFx.Core.Tests\ExpressionTestCases\StronglyTypedEnum_TestEnums_PreV1.txt"; + var path = @"c:\temp\match_unicode.txt"; var line = 0; var runner = new InterpreterRunner(); var testRunner = new TestRunner(runner); - testRunner.AddFile(new Dictionary(), path); + testRunner.AddFile(new Dictionary(), null, path); - // We can filter to just cases we want, set line above + // We can filter to just cases we want, set line above a if (line > 0) { testRunner.Tests.RemoveAll(x => x.SourceLine != line); } - + var result = testRunner.RunTests(); if (result.Fail > 0) { - Assert.True(false, result.Output); + Assert.Fail(result.Output); } else { Console.WriteLine(result.Output); } } + +#if MATCHCOMPARE + // Helper to run a single .txt with regular expression comparison between .NET, Node, and PCRE2 + [Fact] + public void RunOneMatchCompare() + { + ExpressionEvaluationTests.RegExCompareNode = true; + ExpressionEvaluationTests.RegExComparePCRE2 = false; + RunOne(); + } +#endif #endif // Run cases in MutationScripts @@ -236,7 +270,7 @@ private void RunMutationTestFile(string file, Features features, string setup) var testRunner = new TestRunner(runner); - 
testRunner.AddFile(TestRunner.ParseSetupString(setup), path); + testRunner.AddFile(TestRunner.ParseSetupString(setup), null, path); if (testRunner.Tests.Count > 0 && testRunner.Tests[0].SetupHandlerName.Contains("MutationFunctionsTestSetup")) { @@ -297,7 +331,7 @@ public void ScanNotYetReadyForTxtParseErrors() var runner = new TestRunner(); // Verify this runs without throwing an exception. - runner.AddDir(new Dictionary(), path); + runner.AddDir(new Dictionary(), null, path); // Ensure that we actually found tests and not pointed to an empty directory Assert.True(runner.Tests.Count > 10); diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/AsyncVerify.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/AsyncVerify.cs index 0cc4d0bfc2..2c91f9112d 100644 --- a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/AsyncVerify.cs +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/AsyncVerify.cs @@ -83,6 +83,11 @@ public async Task EvalAsync(RecalcEngine engine, string expr, Inte rtConfig.AddService(setup.TimeZoneInfo); } + if (setup.CultureInfo != null) + { + rtConfig.AddService(setup.CultureInfo); + } + var task = engine.EvalAsync(expr, CancellationToken.None, options: setup.Flags.ToParserOptions(new CultureInfo("en-US")), runtimeConfig: rtConfig); var i = 0; diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_Compare.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_Compare.cs new file mode 100644 index 0000000000..ff8462c818 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_Compare.cs @@ -0,0 +1,214 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#if MATCHCOMPARE + +// This file compares the results from .NET, PCRE2, and NODEJS. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using Microsoft.PowerFx.Core.Functions; +using Microsoft.PowerFx.Core.Texl.Builtins; +using Microsoft.PowerFx.Types; +using static Microsoft.PowerFx.Functions.RegEx_NodeJS; +using static Microsoft.PowerFx.Functions.RegEx_PCRE2; + +namespace Microsoft.PowerFx.Functions +{ + public class RegEx_Compare + { + public static void EnableRegExFunctions(PowerFxConfig config, TimeSpan regExTimeout = default, int regexCacheSize = -1, bool includeNode = true, bool includePCRE2 = true) + { + RegexTypeCache regexTypeCache = new (regexCacheSize); + + foreach (KeyValuePair func in RegexFunctions(regExTimeout, regexTypeCache, includeNode, includePCRE2)) + { + if (config.ComposedConfigSymbols.Functions.AnyWithName(func.Key.Name)) + { + throw new InvalidOperationException("Cannot add RegEx functions more than once."); + } + + config.InternalConfigSymbols.AddFunction(func.Key); + config.AdditionalFunctions.Add(func.Key, func.Value); + } + } + + internal static Dictionary RegexFunctions(TimeSpan regexTimeout, RegexTypeCache regexCache, bool includeNode, bool includePCRE2) + { + if (regexTimeout == TimeSpan.Zero) + { + regexTimeout = new TimeSpan(0, 0, 1); + } + + if (regexTimeout.TotalMilliseconds < 0) + { + throw new ArgumentOutOfRangeException(nameof(regexTimeout), "Timeout duration for regular expression execution must be positive."); + } + + return new Dictionary() + { + { new IsMatchFunction(), new Compare_IsMatchImplementation(regexTimeout, includeNode, includePCRE2) }, + { new MatchFunction(regexCache), new Compare_MatchImplementation(regexTimeout, includeNode, includePCRE2) }, + { new MatchAllFunction(regexCache), new Compare_MatchAllImplementation(regexTimeout, includeNode, includePCRE2) } + }; + } + + internal abstract class Compare_CommonImplementation : Library.RegexCommonImplementation + { + protected Library.RegexCommonImplementation dotnet; 
+ protected Library.RegexCommonImplementation node; + protected Library.RegexCommonImplementation pcre2; + + protected Library.RegexCommonImplementation dotnet_alt; + protected Library.RegexCommonImplementation node_alt; + protected Library.RegexCommonImplementation pcre2_alt; + + private string CharCodes(string text) + { + StringBuilder sb = new StringBuilder(); + + foreach (char c in text) + { + sb.Append(Convert.ToInt32(c).ToString("X4")); + sb.Append(" "); + } + + if (sb.Length > 0) + { + return sb.ToString().Substring(0, sb.Length - 1); + } + else + { + return string.Empty; + } + } + + private FormulaValue InvokeRegexFunctionOne(string input, string regex, string options, Library.RegexCommonImplementation dotnet, Library.RegexCommonImplementation node, Library.RegexCommonImplementation pcre2, string kind) + { + var dotnetMatch = dotnet.InvokeRegexFunction(input, regex, options); + var dotnetExpr = dotnetMatch.ToExpression(); + + string nodeExpr = null; + string pcre2Expr = null; + + if (node != null) + { + var nodeMatch = node.InvokeRegexFunction(input, regex, options); + nodeExpr = nodeMatch.ToExpression(); + } + + if (pcre2 != null) + { + var pcre2Match = pcre2.InvokeRegexFunction(input, regex, options); + pcre2Expr = pcre2Match.ToExpression(); + } + + string prefix = null; + + if (nodeExpr != null && nodeExpr != dotnetExpr) + { + prefix = $"{kind}: node != net"; + } + + if (pcre2Expr != null && pcre2Expr != dotnetExpr) + { + prefix = prefix == null ? $"{kind}: pcre2 != net" : $"{prefix}, pcre2 != net"; // keep an earlier node mismatch in the headline + } + + if (prefix != null) + { + var report = + $" re='{regex}' options='{options}'\n" + + $" input='{input}' ({CharCodes(input)})\n" + + $" net={dotnetExpr}\n" + + (nodeExpr != null ? $" node={nodeExpr}\n" : string.Empty) + + (pcre2Expr != null ? 
$" pcre2={pcre2Expr}\n" : string.Empty); + + throw new Exception($"{prefix}\n{report}"); + } + + return dotnetMatch; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + var result = InvokeRegexFunctionOne(input, regex, options, dotnet, node, pcre2, "main"); + + if (dotnet_alt != null) + { + InvokeRegexFunctionOne(input, regex, options, dotnet_alt, node_alt, pcre2_alt, "alt"); + } + + return result; + } + } + + internal class Compare_IsMatchImplementation : Compare_CommonImplementation + { + protected override string DefaultRegexOptions => DefaultIsMatchOptions; + + internal Compare_IsMatchImplementation(TimeSpan regexTimeout, bool includeNode, bool includePCRE2) + { + dotnet = new Library.IsMatchImplementation(regexTimeout); + dotnet_alt = new Library.MatchImplementation(regexTimeout); + + if (includeNode) + { + node = new NodeJS_IsMatchImplementation(regexTimeout); + node_alt = new NodeJS_MatchImplementation(regexTimeout); + } + + if (includePCRE2) + { + pcre2 = new PCRE2_IsMatchImplementation(regexTimeout); + pcre2_alt = new PCRE2_MatchImplementation(regexTimeout); + } + } + } + + internal class Compare_MatchImplementation : Compare_CommonImplementation + { + protected override string DefaultRegexOptions => DefaultMatchOptions; + + internal Compare_MatchImplementation(TimeSpan regexTimeout, bool includeNode, bool includePCRE2) + { + if (includeNode) + { + node = new NodeJS_MatchImplementation(regexTimeout); + } + + if (includePCRE2) + { + pcre2 = new PCRE2_MatchImplementation(regexTimeout); + } + + dotnet = new Library.MatchImplementation(regexTimeout); + } + } + + internal class Compare_MatchAllImplementation : Compare_CommonImplementation + { + protected override string DefaultRegexOptions => DefaultMatchAllOptions; + + internal Compare_MatchAllImplementation(TimeSpan regexTimeout, bool includeNode, bool includePCRE2) + { + if (includeNode) + { + node = new NodeJS_MatchAllImplementation(regexTimeout); + } 
+ + if (includePCRE2) + { + pcre2 = new PCRE2_MatchAllImplementation(regexTimeout); + } + + dotnet = new Library.MatchAllImplementation(regexTimeout); + } + } + } +} + +#endif diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_NodeJS.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_NodeJS.cs new file mode 100644 index 0000000000..8f157b0f00 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_NodeJS.cs @@ -0,0 +1,334 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// This file implements our Regular Expression functions using ECMAScript hosted by Node.js. +// We run tests with this to find semantic differences between our regular expression language and what the JavaScript runtime (Canvas) supports. + +#if MATCHCOMPARE + +using System; +using System.Collections.Generic; +using System.ComponentModel; +using System.Diagnostics; +using System.Linq; +using System.Runtime.Serialization; +using System.Text; +using System.Text.Json; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.CodeAnalysis.Differencing; +using Microsoft.PowerFx.Core.Functions; +using Microsoft.PowerFx.Core.IR; +using Microsoft.PowerFx.Core.Texl.Builtins; +using Microsoft.PowerFx.Syntax; +using Microsoft.PowerFx.Types; +using Newtonsoft.Json.Linq; + +namespace Microsoft.PowerFx.Functions +{ + public class RegEx_NodeJS + { + private static Process node; + private static readonly Mutex NodeMutex = new Mutex(); // serializes each request/response exchange with the shared node process and the static buffers below + + private static TaskCompletionSource readTask = null; + private static string output; + private static string error; + + private static void OutputHandler(object sendingProcess, DataReceivedEventArgs outLine) + { + output = output + outLine.Data; + if (outLine.Data != null && outLine.Data.Contains("%%end%%")) // Data is null once the redirected stream is closed + { + readTask.TrySetResult(true); + } + } + + private static 
void ErrorHandler(object sendingProcess, DataReceivedEventArgs outLine) + { + error = error + outLine.Data; + readTask.TrySetResult(true); + } + + private class JSMatch + { + public int Index { get; set; } + + public string[] Numbered { get; set; } + + public Dictionary Named { get; set; } + } + + internal abstract class NodeJS_RegexCommonImplementation : Library.RegexCommonImplementation + { + internal static FormulaValue Match(string subject, string pattern, string flags, bool matchAll = false) + { + NodeMutex.WaitOne(); + try + { + // hold the mutex until the exchange with node has finished, MatchAsync reads and writes the shared static state + return Task.Run(async () => await MatchAsync(subject, pattern, flags, matchAll)).Result; + } + finally + { + NodeMutex.ReleaseMutex(); + } + } + + internal static async Task MatchAsync(string subject, string pattern, string flags, bool matchAll = false) + { + var js = new StringBuilder(); + + output = string.Empty; + error = string.Empty; + readTask = new TaskCompletionSource(); + + try + { + js.Append($"MatchTest('{subject.Replace("\\", "\\\\").Replace("\r", "\\r").Replace("\n", "\\n").Replace("'", "\\'")}',"); + js.Append($"'{pattern.Replace("\\", "\\\\").Replace("\r", "\\r").Replace("\n", "\\n").Replace("'", "\\'")}',"); + js.Append($"'{flags}',"); + js.Append($"{(matchAll ? "true" : "false")});"); + +#if false + // for debugging unicode passing of strings to Node, output ignored by deserializer but visible in the debugger + js.AppendLine(@" + for (var i = 0; i < subject.length; i++) + { + console.log(subject[i] + '> ' + subject.charCodeAt(i).toString(16)); + } + "); +#endif + + if (node == null) + { + string js2 = @" + function MatchTest( subject, pattern, flags, matchAll ) + { + const [alteredPattern, alteredFlags] = AlterRegex_JavaScript( pattern, flags ); + const regex = RegExp(alteredPattern, alteredFlags.concat(matchAll ? 'g' : '')); + const matches = matchAll ? 
[...subject.matchAll(regex)] : [subject.match(regex)]; + // console.log(alteredPattern); // useful to debug AlterRegex_JavaScript + console.log('%%begin%%'); + if (matches.length != 0 && matches[0] != null) + { + var arr = new Array(); + for (const match of matches) + { + var o = new Object(); + o.Index = match.index; + o.Named = match.groups; + o.Numbered = match; + arr.push(o); + } + console.log(JSON.stringify(arr)); + } + console.log('%%end%%'); + } + "; + + node = new Process(); + node.StartInfo.FileName = "node.exe"; + node.StartInfo.Arguments = "-i"; + node.StartInfo.RedirectStandardInput = true; + node.StartInfo.RedirectStandardOutput = true; + node.StartInfo.RedirectStandardError = true; + node.StartInfo.CreateNoWindow = true; + node.StartInfo.UseShellExecute = false; + node.StartInfo.StandardOutputEncoding = System.Text.Encoding.UTF8; + + // Not supported by .NET framework 4.6.2, we need to use the manual GetBytes method below + // node.StartInfo.StandardInputEncoding = System.Text.Encoding.UTF8; + + node.OutputDataReceived += OutputHandler; + node.ErrorDataReceived += ErrorHandler; + + node.Start(); + + node.BeginOutputReadLine(); + node.BeginErrorReadLine(); + + await node.StandardInput.WriteLineAsync(RegEx_JavaScript.AlterRegex_JavaScript); + await node.StandardInput.WriteLineAsync(js2); + } + + var jsString = js.ToString(); + + var bytes = Encoding.UTF8.GetBytes(jsString); +#pragma warning disable CA1835 + await node.StandardInput.BaseStream.WriteAsync(bytes, 0, bytes.Length); +#pragma warning restore CA1835 + await node.StandardInput.WriteLineAsync(); + + await node.StandardInput.FlushAsync(); + + var complete = await Task.WhenAny(readTask.Task, Task.Delay(TimeSpan.FromSeconds(3))); + + if (complete != readTask.Task) + { + error = "NodeJS Timeout"; + node.Close(); + node = null; + } + + if (error.Length > 0) + { + throw new InvalidOperationException(error); + } + + int begin = output.IndexOf("%%begin%%"); + int end = output.IndexOf("%%end%%"); + + // 
In x mode, comment line endings are [\r\n], but .NET only supports \n. For our purposes here, we can just replace the \r. + pattern = pattern.Replace('\r', '\n'); + var type = new KnownRecordType(GetRecordTypeFromRegularExpression(pattern, (flags.Contains('N') ? RegexOptions.None : RegexOptions.ExplicitCapture) | (flags.Contains('x') ? RegexOptions.IgnorePatternWhitespace : RegexOptions.None))); + + if (end == begin + 9) + { + return matchAll ? FormulaValue.NewTable(type) : new BlankValue(IRContext.NotInSource(type)); + } + + string json = output.Substring(begin + 9, end - begin - 9); + var result = JsonSerializer.Deserialize(json); + + List allMatches = new (); + + foreach (JSMatch match in result) + { + Dictionary fields = new Dictionary() + { + { STARTMATCH, new NamedValue(STARTMATCH, NumberValue.New(Convert.ToDouble(match.Index) + 1)) }, + { FULLMATCH, new NamedValue(FULLMATCH, match.Numbered[0] == null ? BlankValue.NewBlank(FormulaType.String) : StringValue.New(match.Numbered[0])) }, + }; + + if (match.Named != null) + { + foreach (var name in type.FieldNames) + { + if (name != STARTMATCH && name != FULLMATCH && !(name == SUBMATCHES && type.GetFieldType(SUBMATCHES) != FormulaType.String)) + { + fields.Add(name, new NamedValue(name, match.Named.ContainsKey(name) ? StringValue.New(match.Named[name]) : BlankValue.NewBlank(FormulaType.String))); + } + } + } + + if (flags.Contains('N')) + { + List subMatches = new List(); + + for (int i = 1; i < match.Numbered.Count(); i++) + { + var n = match.Numbered[i]; + subMatches.Add(FormulaValue.NewRecordFromFields(new NamedValue(TableValue.ValueName, n == null ? BlankValue.NewBlank(FormulaType.String) : StringValue.New(n)))); + } + + var recordType = RecordType.Empty().Add(TableValue.ValueName, FormulaType.String); + fields.Add(SUBMATCHES, new NamedValue(SUBMATCHES, TableValue.NewTable(recordType, subMatches))); + } + + allMatches.Add(RecordValue.NewRecordFromFields(fields.Values)); + } + + return matchAll ? 
FormulaValue.NewTable(allMatches.First().Type, allMatches) : allMatches.First(); + } + catch (Exception) + { + // rethrow here just so we can debug the exception in the task; a bare throw keeps the original stack trace + throw; + } + } + } + + public static void EnableRegExFunctions(PowerFxConfig config, TimeSpan regExTimeout = default, int regexCacheSize = -1) + { + RegexTypeCache regexTypeCache = new (regexCacheSize); + + foreach (KeyValuePair func in RegexFunctions(regExTimeout, regexTypeCache)) + { + if (config.SymbolTable.Functions.AnyWithName(func.Key.Name)) + { + throw new InvalidOperationException("Cannot add RegEx functions more than once."); + } + + config.SymbolTable.AddFunction(func.Key); + config.AdditionalFunctions.Add(func.Key, func.Value); + } + } + + internal static Dictionary RegexFunctions(TimeSpan regexTimeout, RegexTypeCache regexCache) + { + if (regexTimeout == TimeSpan.Zero) + { + regexTimeout = new TimeSpan(0, 0, 1); + } + + if (regexTimeout.TotalMilliseconds < 0) + { + throw new ArgumentOutOfRangeException(nameof(regexTimeout), "Timeout duration for regular expression execution must be positive."); + } + + return new Dictionary() + { + { new IsMatchFunction(), new NodeJS_IsMatchImplementation(regexTimeout) }, + { new MatchFunction(regexCache), new NodeJS_MatchImplementation(regexTimeout) }, + { new MatchAllFunction(regexCache), new NodeJS_MatchAllImplementation(regexTimeout) } + }; + } + + internal class NodeJS_IsMatchImplementation : NodeJS_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string DefaultRegexOptions => DefaultIsMatchOptions; + + public NodeJS_IsMatchImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + var match = Match(input, regex, options); + + return new BooleanValue(IRContext.NotInSource(FormulaType.Boolean), 
!match.IsBlank()); + } + } + + internal class NodeJS_MatchImplementation : NodeJS_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string DefaultRegexOptions => DefaultMatchOptions; + + public NodeJS_MatchImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + return Match(input, regex, options); + } + } + + internal class NodeJS_MatchAllImplementation : NodeJS_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string DefaultRegexOptions => DefaultMatchAllOptions; + + public NodeJS_MatchAllImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + return Match(input, regex, options, matchAll: true); + } + } + } +} + +#endif diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_PCRE2.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_PCRE2.cs new file mode 100644 index 0000000000..1906739941 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/Helpers/LibraryRegEx_PCRE2.cs @@ -0,0 +1,465 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#if MATCHCOMPARE + +// This file implements our Regular Expression functions using PCRE2 instead of .NET. +// We run tests with this to find semantic differences between our regular expression language and what Excel supports. +// To run this code, make sure that pcre2-32d.dll is in your path, built from https://github.com/PCRE2Project/pcre2 +// with cmake-gui, shared library, AnyCRLF, UTF and UCP support. 
When done properly, "pcre2test -C" will display: +// C:\>pcre2test -C +// PCRE2 version 10.44 2024-06-07 +// Compiled with +// 8-bit support +// 16-bit support +// 32-bit support +// UTF and UCP support (Unicode version 15.0.0) +// No just-in-time compiler support +// Default newline sequence is ANYCRLF +// \R matches all Unicode newlines +// \C is supported +// Internal link size = 2 +// Parentheses nest limit = 250 +// Default heap limit = 20000000 kibibytes +// Default match limit = 10000000 +// Default depth limit = 10000000 +// pcre2test has neither libreadline nor libedit support + +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Reflection; +using System.Runtime.InteropServices; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading; +using Microsoft.PowerFx.Core.Functions; +using Microsoft.PowerFx.Core.IR; +using Microsoft.PowerFx.Core.Texl.Builtins; +using Microsoft.PowerFx.Types; + +namespace Microsoft.PowerFx.Functions +{ + public class RegEx_PCRE2 + { + internal abstract class PCRE2_RegexCommonImplementation : Library.RegexCommonImplementation + { + internal static class NativeMethods + { + // use 32 bit version as PCRE2 as it doesn't support surrogate pairs, we manually convert in/out of surrogate pairs to UTF-32. 
+ + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern IntPtr pcre2_compile_32(byte[] pattern, int patternLength, uint patternOptions, ref int errorNumber, ref int errorOffset, IntPtr context); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_match_32(IntPtr code, byte[] subject, int subjectLength, int subjectOffset, uint subjectOptions, IntPtr matchData, IntPtr matchContext); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_exec_32(IntPtr code, IntPtr extra, byte[] subject, int subjectLength, int subjectOffset, uint subjectOptions, IntPtr ovector, IntPtr ovectorSize); + + [DllImport("pcre2-32.dll")] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern IntPtr pcre2_match_data_create_32(int ovecSize, IntPtr generalContext); + + [DllImport("pcre2-32.dll")] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern IntPtr pcre2_match_data_create_from_pattern_32(IntPtr code, IntPtr generalContext); + + [DllImport("pcre2-32.dll")] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_get_startchar_32(IntPtr matchData); + + [DllImport("pcre2-32.dll")] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_get_ovector_count_32(IntPtr matchData); + + [DllImport("pcre2-32.dll")] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern IntPtr pcre2_get_ovector_pointer_32(IntPtr matchData); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int 
pcre2_substring_number_from_name_32(IntPtr code, byte[] name); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern void pcre2_match_data_free_32(IntPtr data); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern void pcre2_code_free_32(IntPtr code); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_get_error_message_32(int code, IntPtr buffer, int bufferLength); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern int pcre2_set_compile_extra_options_32(IntPtr context, uint extraOptions); + + [DllImport("pcre2-32.dll", CharSet = CharSet.Unicode)] + [DefaultDllImportSearchPaths(DllImportSearchPath.SafeDirectories)] + internal static extern IntPtr pcre2_compile_context_create_32(IntPtr generalContext); + } + + internal enum PCRE2_OPTIONS : uint + { + CASELESS = 0x00000008, + MULTILINE = 0x00000400, + DOTALL = 0x00000020, + EXTENDED = 0x00000080, + UCP = 0x00020000, + UTF = 0x00080000, + NO_AUTO_CAPTURE = 0x00002000, + } + + internal enum PCRE2_EXTRA_OPTIONS : uint + { + ALLOW_SURROGATE_ESCAPES = 0x00000001, + } + + internal enum PCRE2_MATCH_OPTIONS : uint + { + NOTEMPTY = 0x00000004, + NOTEMPTY_ATSTART = 0x00000008, + } + + private static readonly Mutex PCRE2Mutex = new Mutex(); // protect concurrent access to the node process + + private static string Extract(byte[] bytes, int start, int end) + { + StringBuilder result = new StringBuilder(); + + for (int i = start; i < end; i++) + { + int number = BitConverter.ToInt32(bytes, i * 4); + string s = char.ConvertFromUtf32(number); + result.Append(s); + } + + result.Replace('\uf8ff', '\u180e'); + + return 
result.ToString(); + } + + internal static FormulaValue Match(string subject, string pattern, string flags, bool matchAll = false) + { + int errorNumber = 0; + int errorOffset = 0; + IntPtr matchContext = (IntPtr)0; + IntPtr generalContext = (IntPtr)0; + + PCRE2_OPTIONS pcreOptions = PCRE2_OPTIONS.UTF | PCRE2_OPTIONS.UCP; + RegexOptions options = RegexOptions.None; + + Match inlineOptions = Regex.Match(pattern, @"^\(\?([imnsx]+)\)"); + if (inlineOptions.Success) + { + flags = flags + inlineOptions.Groups[1]; + pattern = pattern.Substring(inlineOptions.Length); + } + + if (!flags.Contains('N')) + { + pcreOptions |= PCRE2_OPTIONS.NO_AUTO_CAPTURE; + options |= RegexOptions.ExplicitCapture; + } + + if (flags.Contains('i')) + { + pcreOptions |= PCRE2_OPTIONS.CASELESS; + options |= RegexOptions.IgnoreCase; + } + + if (flags.Contains('m')) + { + pcreOptions |= PCRE2_OPTIONS.MULTILINE; + options |= RegexOptions.Multiline; + } + + if (flags.Contains('s')) + { + pcreOptions |= PCRE2_OPTIONS.DOTALL; + options |= RegexOptions.Singleline; + } + + if (flags.Contains('x')) + { + // replace the three characters that PCRE2 recognizes as white space that Power Fx does not + // add a \n so that we can add a $ at the end, in case there is an unterminated pound comment + pattern = pattern.Replace("\u000b", "\\u000b").Replace("\u2028", "\\u2028").Replace("\u2029", "\\u2029") + '\n'; + pcreOptions |= PCRE2_OPTIONS.EXTENDED; + options |= RegexOptions.IgnorePatternWhitespace; + } + + if (flags.Contains('^') && (pattern.Length == 0 || pattern[0] != '^')) + { + pattern = "^" + pattern; + } + + if (flags.Contains('$') && (pattern.Length == 0 || pattern[pattern.Length - 1] != '$')) + { + pattern = pattern + "$"; + } + + // convert out of surrogate pairs and into UTF-32 for the pattern manually + // conversion of the subject is handled with Encoding.UTF32.GetBytes below + + StringBuilder patternSurrogates = new StringBuilder(); + + for (int i = 0; i < pattern.Length; i++) + { + if (i + 11 <
pattern.Length && pattern[i] == '\\' && pattern[i + 1] == 'u' && pattern[i + 6] == '\\' && pattern[i + 7] == 'u') + { + var s1 = Convert.ToInt32(Convert.ToInt32(pattern.Substring(i + 2, 4), 16)); + var s2 = Convert.ToInt32(Convert.ToInt32(pattern.Substring(i + 8, 4), 16)); + if (s1 >= 0xd800 && s1 <= 0xdbff && s2 >= 0xdc00 && s2 <= 0xdfff) + { + patternSurrogates.Append("\\x{" + Convert.ToString(((s1 - 0xd800) * 0x400) + (s2 - 0xdc00) + 0x10000, 16) + "}"); + i += 11; + } + } + else if (i + 5 < pattern.Length && pattern[i] == '\\' && pattern[i + 1] == 'u') + { + patternSurrogates.Append("\\x{" + pattern[i + 2] + pattern[i + 3] + pattern[i + 4] + pattern[i + 5] + "}"); + i += 5; + } + else + { + patternSurrogates.Append(pattern[i]); + } + } + + var patternPCRE2 = patternSurrogates.ToString(); + + PCRE2Mutex.WaitOne(); + + var context = NativeMethods.pcre2_compile_context_create_32(generalContext); + +#if false + // not needed as we convert out of surrogate pairs above + NativeMethods.pcre2_set_compile_extra_options_32(context, (uint)PCRE2_EXTRA_OPTIONS.ALLOW_SURROGATE_ESCAPES); +#endif + + var code = NativeMethods.pcre2_compile_32(Encoding.UTF32.GetBytes(patternPCRE2), -1, (uint)pcreOptions, ref errorNumber, ref errorOffset, context); + if (code == IntPtr.Zero) + { + byte[] buffer = new byte[4096]; + + GCHandle pinnedArray = GCHandle.Alloc(buffer, GCHandleType.Pinned); + IntPtr pointer = pinnedArray.AddrOfPinnedObject(); + + NativeMethods.pcre2_get_error_message_32(errorNumber, pointer, buffer.Length); + var message = System.Text.Encoding.Unicode.GetString(buffer); + var fullMessage = $"PCRE2 error compiling {patternPCRE2}, errorNumber={errorNumber} ({message}), errorOffset={errorOffset}"; + + pinnedArray.Free(); + + PCRE2Mutex.ReleaseMutex(); + throw new Exception(fullMessage); + } + + var md = NativeMethods.pcre2_match_data_create_from_pattern_32(code, generalContext); + + var startMatch = 0; + List allMatches = new (); + + // PCRE2 uses an older definition of 
Unicode where 180e is a space character, moving it to something else (a user-defined character) here for category comparison tests + subject = subject.Replace('\u180e', '\uf8ff'); + + var subjectBytes = Encoding.UTF32.GetBytes(subject); + PCRE2_MATCH_OPTIONS matchOptions = 0; + while (startMatch >= 0 && NativeMethods.pcre2_match_32(code, subjectBytes, -1, startMatch, (uint)matchOptions, md, matchContext) > 0) + { + Dictionary fields = new (); + + var sc = NativeMethods.pcre2_get_startchar_32(md); + fields.Add(STARTMATCH, new NamedValue(STARTMATCH, NumberValue.New((double)sc + 1))); + + IntPtr op = NativeMethods.pcre2_get_ovector_pointer_32(md); + var start0 = Marshal.ReadInt32(op, 0); + var end0 = Marshal.ReadInt32(op, Marshal.SizeOf(typeof(long))); + fields.Add(FULLMATCH, new NamedValue(FULLMATCH, StringValue.New(Extract(subjectBytes, start0, end0)))); + + // for next iteration + if (matchAll) + { + startMatch = end0; + if (end0 == start0) + { +#if false + startMatch++; +#else + if (matchOptions == 0) + { + matchOptions = PCRE2_MATCH_OPTIONS.NOTEMPTY_ATSTART; + } + else + { + throw new Exception("PCRE2 repeated empty result"); + } +#endif + } + else + { + matchOptions = 0; + } + } + else + { + startMatch = -1; + } + + List subMatches = new List(); + var oc = NativeMethods.pcre2_get_ovector_count_32(md); + for (var i = 1; i < oc; i++) + { + var start = Marshal.ReadInt32(op, i * 2 * Marshal.SizeOf(typeof(long))); + var end = Marshal.ReadInt32(op, ((i * 2) + 1) * Marshal.SizeOf(typeof(long))); + if (start >= 0 && end >= 0) + { + subMatches.Add(StringValue.New(Extract(subjectBytes, start, end))); + } + else + { + subMatches.Add(BlankValue.NewBlank(FormulaType.String)); + } + } + + if (!fields.ContainsKey(SUBMATCHES) && (options & RegexOptions.ExplicitCapture) == 0) + { + var recordType = RecordType.Empty().Add(TableValue.ValueName, FormulaType.String); + fields.Add(SUBMATCHES, new NamedValue(SUBMATCHES, TableValue.NewTable(recordType, subMatches.Select(s =>
FormulaValue.NewRecordFromFields(new NamedValue(TableValue.ValueName, s)))))); + } + else + { + // In x mode, comment line endings are [\r\n], but .NET only supports \n. For our purposes here, we can just replace the \r. + pattern = pattern.Replace('\r', '\n'); + var regex = new Regex(pattern, options); + foreach (var name in regex.GetGroupNames()) + { + if (!int.TryParse(name, out _)) + { + var ni = NativeMethods.pcre2_substring_number_from_name_32(code, Encoding.UTF32.GetBytes(name)); + fields.Add(name, new NamedValue(name, subMatches[ni - 1])); + } + } + } + + allMatches.Add(RecordValue.NewRecordFromFields(fields.Values)); + } + + NativeMethods.pcre2_match_data_free_32(md); + NativeMethods.pcre2_code_free_32(code); + + PCRE2Mutex.ReleaseMutex(); + + if (allMatches.Count == 0) + { + return matchAll ? FormulaValue.NewTable(new KnownRecordType(GetRecordTypeFromRegularExpression(pattern, options))) + : new BlankValue(IRContext.NotInSource(new KnownRecordType(GetRecordTypeFromRegularExpression(pattern, options)))); + } + else + { + return matchAll ? 
FormulaValue.NewTable(allMatches.First().Type, allMatches) + : allMatches.First(); + } + } + } + + public static void EnableRegExFunctions(PowerFxConfig config, TimeSpan regExTimeout = default, int regexCacheSize = -1) + { + RegexTypeCache regexTypeCache = new (regexCacheSize); + + foreach (KeyValuePair func in RegexFunctions(regExTimeout, regexTypeCache)) + { + if (config.SymbolTable.Functions.AnyWithName(func.Key.Name)) + { + throw new InvalidOperationException("Cannot add RegEx functions more than once."); + } + + config.SymbolTable.AddFunction(func.Key); + config.AdditionalFunctions.Add(func.Key, func.Value); + } + } + + internal static Dictionary RegexFunctions(TimeSpan regexTimeout, RegexTypeCache regexCache) + { + if (regexTimeout == TimeSpan.Zero) + { + regexTimeout = new TimeSpan(0, 0, 1); + } + + if (regexTimeout.TotalMilliseconds < 0) + { + throw new ArgumentOutOfRangeException(nameof(regexTimeout), "Timeout duration for regular expression execution must be positive."); + } + + return new Dictionary() + { + { new IsMatchFunction(), new PCRE2_IsMatchImplementation(regexTimeout) }, + { new MatchFunction(regexCache), new PCRE2_MatchImplementation(regexTimeout) }, + { new MatchAllFunction(regexCache), new PCRE2_MatchAllImplementation(regexTimeout) } + }; + } + + internal class PCRE2_IsMatchImplementation : PCRE2_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string DefaultRegexOptions => DefaultIsMatchOptions; + + public PCRE2_IsMatchImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + var match = Match(input, regex, options); + + return new BooleanValue(IRContext.NotInSource(FormulaType.Boolean), !match.IsBlank()); + } + } + + internal class PCRE2_MatchImplementation : PCRE2_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string 
DefaultRegexOptions => DefaultMatchOptions; + + public PCRE2_MatchImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + return Match(input, regex, options); + } + } + + internal class PCRE2_MatchAllImplementation : PCRE2_RegexCommonImplementation + { + private readonly TimeSpan _regexTimeout; + + protected override string DefaultRegexOptions => DefaultMatchAllOptions; + + public PCRE2_MatchAllImplementation(TimeSpan regexTimeout) + { + _regexTimeout = regexTimeout; + } + + internal override FormulaValue InvokeRegexFunction(string input, string regex, string options) + { + return Match(input, regex, options, matchAll: true); + } + } + } +} + +#endif diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/IsMatch_V1CompatDisabled_Overrides.txt b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/IsMatch_V1CompatDisabled_Overrides.txt new file mode 100644 index 0000000000..e993cc1469 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/IsMatch_V1CompatDisabled_Overrides.txt @@ -0,0 +1,8 @@ +#override: IsMatch_V1CompatDisabled.txt +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +// The C# interpeter doesn't have a non-V1 implementation of the Match functions, so SubMatches will not appear in these results + +// Dangerous Regex, will timeout (should take >2h on a fast CPU) - but only with Pre-V1 or MatchOptions.NumberedSubMatches +>> IsMatch("ababababababababababababababababababababababababababababababababababa", "^((ab)*)+$") +false \ No newline at end of file diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/MatchAll_V1CompatDisabled_Overrides.txt 
b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/MatchAll_V1CompatDisabled_Overrides.txt new file mode 100644 index 0000000000..1c2d6e9aa1 --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/MatchAll_V1CompatDisabled_Overrides.txt @@ -0,0 +1,26 @@ +#override: MatchAll_V1CompatDisabled.txt +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +// The C# interpreter doesn't have a non-V1 implementation of the Match functions, so SubMatches will not appear in these results + +>> MatchAll("Hello", "\w") +Table({FullMatch:"H",StartMatch:1},{FullMatch:"e",StartMatch:2},{FullMatch:"l",StartMatch:3},{FullMatch:"l",StartMatch:4},{FullMatch:"o",StartMatch:5}) + +>> MatchAll("Bob Jones <bob.jones@contoso.com>", "<(?<email>" & Match.Email & ")>") +Table({FullMatch:"<bob.jones@contoso.com>",StartMatch:11,email:"bob.jones@contoso.com"}) + +>> MatchAll("PT2H1M39S", "PT(?:(?<hours>\d+)H)?(?:(?<minutes>\d+)M)?(?:(?<seconds>\d+)S)?") +Table({FullMatch:"PT2H1M39S",StartMatch:1,hours:"2",minutes:"1",seconds:"39"}) + +>> MatchAll("Hello", "(?<p1>\w)l(?<p2>\w)") +Table({FullMatch:"ell",StartMatch:2,p1:"e",p2:"l"}) + +>> MatchAll("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) +Table({FullMatch:"Joe 164",StartMatch:1},{FullMatch:"Sam 208",StartMatch:9}) + +>> MatchAll("Hello", "Hello", MatchOptions.IgnoreCase) +Table({FullMatch:"Hello",StartMatch:1}) + +>> MatchAll("Hi", "Hi", MatchOptions.Multiline) +Table({FullMatch:"Hi",StartMatch:1}) + diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/Match_V1CompatDisabled_Overrides.txt b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/Match_V1CompatDisabled_Overrides.txt new file mode 100644 index 0000000000..fd00013f7b --- /dev/null +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/InterpreterExpressionTestCases/Match_V1CompatDisabled_Overrides.txt @@ -0,0 +1,39 @@ +#override:
Match_V1CompatDisabled.txt +#SETUP: RegEx,disable:PowerFxV1CompatibilityRules + +// The C# interpreter doesn't have a non-V1 implementation of the Match functions, so SubMatches will not appear in these results + +>> Match("Hello", "\w") +{FullMatch:"H",StartMatch:1} + +// The following tests return Blank() because SubMatches is correctly handled by the front end (because V1 is disabled), but the interpreter doesn't handle it properly (because V1 is enabled) + +>> Concat(ForAll(Match( "Bob Jones <bob.jones@contoso.com>", "<(?<email>" & Match.Email & ")>").SubMatches, With({x:Value}, x)), Value, ", ") +Blank() + +>> Index(Match("Hello", "(?<p1>\w)l(?<p2>\w)").SubMatches, 1).Value +Blank() + +>> Index(Match("Hello", "(?<p1>\w)l(?<p2>\w)").SubMatches, 2).Value +Blank() + +>> Concat(ForAll(Match("Hello", "(?<p1>\w)l(?<p2>\w)").SubMatches, With({x:Value}, x)), Value, ", ") +Blank() + +>> Match("Hello", "(?<p1>\w)l(?<p2>\w)").SubMatches +Blank() + +>> Match("Joe 164" & Char(10) & "Sam 208" & Char(10), "(\w+)\s(\d+)", MatchOptions.Complete & MatchOptions.Multiline) +{FullMatch:"Joe 164",StartMatch:1} + +>> Match("JohnDoe@microsoft.com", Match.Email) +{FullMatch:"JohnDoe@microsoft.com",StartMatch:1} + +>> Match("(555) 123-4567", "^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$") +{FullMatch:"(555) 123-4567",StartMatch:1} + +>> Match("Hello", "Hello", MatchOptions.IgnoreCase) +{FullMatch:"Hello",StartMatch:1} + +>> Match("Hi", "Hi", MatchOptions.Multiline) +{FullMatch:"Hi",StartMatch:1} diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/PowerFxEvaluationTests.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/PowerFxEvaluationTests.cs index 6c1251a9f3..895cde5e07 100644 --- a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/PowerFxEvaluationTests.cs +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/PowerFxEvaluationTests.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.Drawing; using
System.Globalization; using System.Linq; @@ -63,16 +64,31 @@ private static object EnableJsonFunctions(PowerFxConfig config, SymbolTable symb config.EnableJsonFunctions(); return null; } + +#if MATCHCOMPARE + // This "global" turns on regex comparison. Yes, it is a hack, but it is only used for manual testing (no automated testing). + public static bool RegExCompareNode = false; + public static bool RegExComparePCRE2 = false; +#endif private static object RegExSetup(PowerFxConfig config, SymbolTable symbolTable) - { -#pragma warning disable CS0618 // Type or member is obsolete - config.EnableRegExFunctions(new TimeSpan(0, 0, 5)); -#pragma warning restore CS0618 // Type or member is obsolete - - return null; - } - + { +#if MATCHCOMPARE + if (RegExCompareNode || RegExComparePCRE2) + { + Functions.RegEx_Compare.EnableRegExFunctions(config, new TimeSpan(0, 0, 5), includeNode: RegExCompareNode, includePCRE2: RegExComparePCRE2); + } + else +#endif + { +#pragma warning disable CS0618 // Type or member is obsolete + config.EnableRegExFunctions(new TimeSpan(0, 0, 5)); +#pragma warning restore CS0618 // Type or member is obsolete + } + + return null; + } + private static object BlobSetup(PowerFxConfig config, SymbolTable symbolTable) { config.AddBlobTestFunctions(); @@ -963,6 +979,11 @@ protected override async Task RunAsyncInternal(string expr, string se runtimeConfig.AddService(iSetup.TimeZoneInfo); } + if (iSetup.CultureInfo != null) + { + runtimeConfig.AddService(iSetup.CultureInfo); + } + if (engine.TryGetByName("traceRecord", out _)) { var traceRecord = engine.GetValue("traceRecord"); diff --git a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/RegExTests.cs b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/RegExTests.cs index c5d3207807..7947f26a4d 100644 --- a/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/RegExTests.cs +++ b/src/tests/Microsoft.PowerFx.Interpreter.Tests.Shared/RegExTests.cs @@ -2,6 +2,9 @@ // Licensed under the MIT license. 
using System; +using System.Linq; +using System.Security.Cryptography.X509Certificates; +using Microsoft.PowerFx.Types; using Xunit; namespace Microsoft.PowerFx.Interpreter.Tests @@ -46,5 +49,34 @@ public void TestRegExEnableTwice2() PowerFxConfig config2 = new PowerFxConfig(); config2.EnableRegExFunctions(TimeSpan.FromMilliseconds(50), 20); } + + // First of these iss a dangerous Regex, will timeout (should take >2h on a fast CPU) - but only MatchOptions.NumberedSubMatches + [Theory] + [InlineData("ababababababababababababababababababababababababababababababababababa", "^((ab)*)+$", true, true, true)] + [InlineData("ababababababababababababababababababababababababababababababababababa", "^((ab)*)+$", false, false, false)] + [InlineData("ababababababababababababababababababababababababababababababababababa", "^((ab)*)+$", false, false, false)] + public void TestRegExTimeoutWorks(string subject, string pattern, bool subMatches, bool expError, bool expBoolean) + { + PowerFxConfig config = new PowerFxConfig(); + config.EnableRegExFunctions(new TimeSpan(0, 0, 5)); + RecalcEngine engine = new RecalcEngine(config); + + var formula = $"IsMatch(\"{subject}\", \"{pattern}\" {(subMatches ? 
", MatchOptions.NumberedSubMatches" : string.Empty)})"; + + FormulaValue fv = engine.Eval(formula, null, new ParserOptions { AllowsSideEffects = true }); + + if (expError) + { + Assert.True(fv is ErrorValue); + ErrorValue ev = (ErrorValue)fv; + Assert.Equal(ErrorKind.Timeout, ev.Errors.First().Kind); + } + else + { + Assert.True(fv is BooleanValue); + BooleanValue bv = (BooleanValue)fv; + Assert.Equal(expBoolean, bv.Value); + } + } } } diff --git a/src/tools/Repl/Program.cs b/src/tools/Repl/Program.cs index 116e3db275..d1603cac0f 100644 --- a/src/tools/Repl/Program.cs +++ b/src/tools/Repl/Program.cs @@ -42,6 +42,16 @@ public static class ConsoleRepl private const string OptionTextFirst = "TextFirst"; private static bool _textFirst = false; +#if MATCHCOMPARE + // to enable, place this in Solution Items/Directiory.Build.Props: + // + // $(DefineConstants);MATCHCOMPARE + // + + private const string OptionMatchCompare = "MatchCompare"; + private static bool _matchCompare = false; +#endif + private const string OptionUDF = "UserDefinedFunctions"; private static bool _enableUDFs = true; @@ -49,6 +59,8 @@ public static class ConsoleRepl private static StandardFormatter _standardFormatter; + private static CultureInfo _cultureInfo = CultureInfo.CurrentCulture; + private static bool _reset; private static RecalcEngine ReplRecalcEngine() @@ -70,6 +82,9 @@ private static RecalcEngine ReplRecalcEngine() { OptionHashCodes, OptionHashCodes }, { OptionStackTrace, OptionStackTrace }, { OptionTextFirst, OptionTextFirst }, +#if MATCHCOMPARE + { OptionMatchCompare, OptionMatchCompare }, +#endif { OptionUDF, OptionUDF }, }; @@ -97,10 +112,24 @@ private static RecalcEngine ReplRecalcEngine() config.AddFunction(new Option2Function()); config.AddFunction(new Run1Function()); config.AddFunction(new Run2Function()); + config.AddFunction(new Language1Function()); var optionsSet = new OptionSet("Options", DisplayNameUtility.MakeUnique(options)); - config.EnableRegExFunctions(new TimeSpan(0, 
0, 5)); +#if MATCHCOMPARE + if (_matchCompare) + { + // requires PCRE2 DLL (pcre2-16d.dll) on the path and Node.JS installed + // can also use RegEx_PCRE2 and RegEx_NodeJS directly too + Functions.RegEx_Compare.EnableRegExFunctions(config, new TimeSpan(0, 0, 5)); + } + else +#endif + { +#pragma warning disable CS0618 // Type or member is obsolete + config.EnableRegExFunctions(new TimeSpan(0, 0, 5)); +#pragma warning restore CS0618 // Type or member is obsolete + } config.AddOptionSet(optionsSet); @@ -124,6 +153,8 @@ public static void Main() REPL(Console.In, prompt: true, echo: false, printResult: true, lineNumber: null); } +#pragma warning disable CS0618 + // Hook repl engine with customizations. private class MyRepl : PowerFxREPL { @@ -135,6 +166,10 @@ public MyRepl() this.ValueFormatter = _standardFormatter; this.HelpProvider = new MyHelpProvider(); + var bsp = new BasicServiceProvider(); + bsp.AddService(_cultureInfo); + this.InnerServices = bsp; + this.AllowSetDefinitions = true; this.AllowUserDefinedFunctions = _enableUDFs; this.AllowImport = true; @@ -262,6 +297,9 @@ public FormulaValue Execute() sb.Append(CultureInfo.InvariantCulture, $"{"StackTrace:",-42}{_stackTrace}\n"); sb.Append(CultureInfo.InvariantCulture, $"{"TextFirst:",-42}{_textFirst}\n"); sb.Append(CultureInfo.InvariantCulture, $"{"UserDefinedFunctions:",-42}{_enableUDFs}\n"); +#if MATCHCOMPARE + sb.Append(CultureInfo.InvariantCulture, $"{"MatchCompare:",-42}{_matchCompare}\n"); +#endif foreach (var prop in typeof(Features).GetProperties(BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)) { @@ -381,6 +419,15 @@ public FormulaValue Execute(StringValue option, BooleanValue value) return value; } +#if MATCHCOMPARE + if (string.Equals(option.Value, OptionMatchCompare, StringComparison.OrdinalIgnoreCase)) + { + _matchCompare = value.Value; + _reset = true; + return value; + } +#endif + if (string.Equals(option.Value, OptionPowerFxV1, StringComparison.OrdinalIgnoreCase)) { foreach 
(var prop in typeof(Features).GetProperties(BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)) @@ -426,6 +473,26 @@ public FormulaValue Execute(StringValue option, BooleanValue value) } } + // set the language + private class Language1Function : ReflectionFunction + { + public Language1Function() + : base("Language", FormulaType.Void, new[] { FormulaType.String }) + { + } + + public FormulaValue Execute(StringValue lang) + { + var cultureInfo = new CultureInfo(lang.Value); + + _cultureInfo = cultureInfo; + + _reset = true; + + return FormulaValue.NewVoid(); + } + } + private class MyHelpProvider : HelpProvider { public override async Task Execute(PowerFxREPL repl, CancellationToken cancel, string context = null) @@ -498,6 +565,8 @@ Use Option( Options.FormatTable, false ) to disable table formatting. Use Option() to see the list of all options with their current value. Use Help( ""Options"" ) for more information. +Use Language( ""en-US"" ) to set culture info. + Once a formula is defined or a variable's type is defined, it cannot be changed. Use Reset() to clear all formulas and variables. "; diff --git a/src/tools/Repl/Repl.csproj b/src/tools/Repl/Repl.csproj index c8f1f2f87e..c178e10b00 100644 --- a/src/tools/Repl/Repl.csproj +++ b/src/tools/Repl/Repl.csproj @@ -15,4 +15,12 @@ + + + + + + + +