Skip to content

Commit

Permalink
feat utils: add useful std::match_results methods to utils::match_res…
Browse files Browse the repository at this point in the history
…ults

commit_hash:2f08f8e3dbb838706abe14302b447e9fb1d0be6a
  • Loading branch information
Anton3 committed Jan 28, 2025
1 parent 59ea09d commit 48f7ee4
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 24 deletions.
21 changes: 20 additions & 1 deletion universal/include/userver/utils/regex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ class RegexError : public std::exception {};
/// 2. quantifiers over 1000, regexes with large repetition counts consume more memory;
/// 3. spaces in quantifiers like `\w{1, 5}`;
/// 4. possessive quantifiers.
///
/// ## An example of complex string parsing using `utils::regex`
///
/// @snippet utils/regex_test.cpp split text
class regex final {
public:
/// Constructs a null regex, any usage except for copy/move is UB.
Expand Down Expand Up @@ -107,9 +111,24 @@ class match_results final {
/// @note Group 0 always matches the whole pattern. User groups start with index 1.
std::string_view operator[](std::size_t sub) const;

/// @returns the position of the first character of the capturing group @a sub within the target (haystack) string,
/// as a zero-based offset in `char`s from the beginning of the target.
/// @note Group 0 always matches the whole pattern. User groups start with index 1.
/// @pre @a sub is less than size().
/// @warning For empty groups, calling this method is UB. Group 0 is always valid.
std::size_t position(std::size_t sub) const;

/// @returns the length of the capturing group at @a sub, which is 0 for an empty group.
/// @note Group 0 always matches the whole pattern. User groups start with index 1.
/// @pre @a sub is less than size().
std::size_t length(std::size_t sub) const;

/// @returns the substring from the beginning of the target (haystack) string to the beginning of the full match.
/// @pre size() is greater than 0.
/// @note The returned view refers to the target string given to the matching function and does not own it.
std::string_view prefix() const;

/// @returns the substring from the end of the full match to the end of the target (haystack) string.
/// @pre size() is greater than 0.
/// @note The returned view refers to the target string given to the matching function and does not own it.
std::string_view suffix() const;

private:
struct Impl;
utils::FastPimpl<Impl, 104, 8> impl_;
utils::FastPimpl<Impl, 120, 8> impl_;

friend bool regex_match(std::string_view str, const regex& pattern);
friend bool regex_match(std::string_view str, match_results& m, const regex& pattern);
Expand Down
38 changes: 34 additions & 4 deletions universal/src/utils/regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,17 @@ std::string regex::str() const { return std::string{GetPatternView()}; }
struct match_results::Impl {
Impl() = default;

void Prepare(const regex& pattern) {
void Prepare(std::string_view target, const regex& pattern) {
this->target = target;

UASSERT(pattern.impl_->regex);
const auto groups_count = pattern.impl_->GetCapturingGroupCount() + 1;
if (groups_count > groups.size()) {
groups.resize(groups_count);
}
}

std::string_view target;
boost::container::small_vector<re2::StringPiece, kGroupsSboSize> groups;
};

Expand All @@ -160,11 +163,38 @@ match_results::~match_results() = default;
std::size_t match_results::size() const { return impl_->groups.size(); }

std::string_view match_results::operator[](std::size_t sub) const {
UASSERT(impl_->groups.size() > sub);
UASSERT(sub < size());
const auto substr = impl_->groups[sub];
return {substr.data(), substr.size()};
}

// Returns the offset of capturing group `sub` from the start of the stored target string.
std::size_t match_results::position(std::size_t sub) const {
// The group index must be within the groups prepared for the pattern.
UASSERT(sub < size());
const auto substr = impl_->groups[sub];
// An empty group other than group 0 is rejected: the invariant fails with a message
// that names the group index and the target string.
UINVARIANT(
sub == 0 || !substr.empty(),
fmt::format(
"Trying to access position of capturing group {}, which is empty (missing), target='{}'", sub, impl_->target
)
);
// Pointer difference between the start of the group and the start of the target.
return substr.data() - impl_->target.data();
}

// Returns the number of characters in capturing group `sub`.
std::size_t match_results::length(std::size_t sub) const {
    UASSERT(sub < size());
    const auto& group = impl_->groups[sub];
    return group.size();
}

// Returns the part of the target string that precedes the whole match (group 0).
std::string_view match_results::prefix() const {
    UASSERT_MSG(size() > 0, "Empty match_results object");
    const std::size_t match_begin = position(0);
    return impl_->target.substr(0, match_begin);
}

// Returns the part of the target string that follows the whole match (group 0).
std::string_view match_results::suffix() const {
    UASSERT_MSG(size() > 0, "Empty match_results object");
    const std::size_t match_end = position(0) + impl_->groups[0].size();
    return impl_->target.substr(match_end);
}

////////////////////////////////////////////////////////////////

bool regex_match(std::string_view str, const regex& pattern) {
Expand All @@ -178,7 +208,7 @@ bool regex_match(std::string_view str, const regex& pattern) {

bool regex_match(std::string_view str, match_results& m, const regex& pattern) {
UASSERT(pattern.impl_->regex);
m.impl_->Prepare(pattern);
m.impl_->Prepare(str, pattern);
return utils::Visit(
*pattern.impl_->regex,
[&](const re2::RE2& regex) {
Expand Down Expand Up @@ -214,7 +244,7 @@ bool regex_search(std::string_view str, const regex& pattern) {

bool regex_search(std::string_view str, match_results& m, const regex& pattern) {
UASSERT(pattern.impl_->regex);
m.impl_->Prepare(pattern);
m.impl_->Prepare(str, pattern);
return utils::Visit(
*pattern.impl_->regex,
[&](const re2::RE2& regex) {
Expand Down
161 changes: 149 additions & 12 deletions universal/src/utils/regex_test.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include <userver/utils/regex.hpp>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <userver/utest/assert_macros.hpp>
#include <userver/utils/fast_scope_guard.hpp>
#include <userver/utils/regex.hpp>

USERVER_NAMESPACE_BEGIN

Expand Down Expand Up @@ -49,15 +51,14 @@ TEST(Regex, Match) {
}

TEST(Regex, MatchWithResult) {
utils::regex r("^[a-z][0-9]+");
const utils::regex r("^[a-z][0-9]+");

utils::match_results fail;
const std::string str_empty{};
constexpr std::string_view str_empty{};
EXPECT_FALSE(utils::regex_search(str_empty, fail, r));
ASSERT_EQ(fail.size(), 1);
const std::string_view empty = fail[0];
EXPECT_EQ(empty, str_empty);

utils::match_results success;
const std::string str{"a1234"};
constexpr std::string_view str{"a1234"};
EXPECT_TRUE(utils::regex_match(str, success, r));
ASSERT_EQ(success.size(), 1);
const std::string_view res = success[0];
Expand Down Expand Up @@ -145,24 +146,117 @@ TEST(Regex, SearchNegativeLookahead) {
EXPECT_FALSE(utils::regex_search(" bad42 ", match, r));
}

TEST(Regex, SearchEmptyCaptureGroupsGoldenTest) {
// Checks operator[], position(), length(), prefix() and suffix() after a successful search
// in which both user groups are non-empty.
TEST(Regex, SearchMatchResultsMethods) {
    constexpr std::string_view haystack = " foo ab4 bar";
    const utils::regex pattern("(\\w{2})(\\d)");

    utils::match_results results;
    ASSERT_TRUE(utils::regex_search(haystack, results, pattern));
    ASSERT_EQ(results.size(), 3);

    // Group 0: the whole match.
    EXPECT_EQ(results[0], "ab4");
    EXPECT_EQ(results[0].data(), haystack.data() + 6);
    EXPECT_EQ(results.position(0), 6);
    EXPECT_EQ(results.length(0), 3);

    // Group 1: the two word characters.
    EXPECT_EQ(results[1], "ab");
    EXPECT_EQ(results[1].data(), haystack.data() + 6);
    EXPECT_EQ(results.position(1), 6);
    EXPECT_EQ(results.length(1), 2);

    // Group 2: the digit.
    EXPECT_EQ(results[2], "4");
    EXPECT_EQ(results[2].data(), haystack.data() + 8);
    EXPECT_EQ(results.position(2), 8);
    EXPECT_EQ(results.length(2), 1);

    // Text around the whole match; both views must point into the haystack.
    EXPECT_EQ(results.prefix(), " foo ");
    EXPECT_EQ(results.prefix().data(), haystack.data());
    EXPECT_EQ(results.suffix(), " bar");
    EXPECT_EQ(results.suffix().data(), haystack.data() + 9);
}

namespace {

/// [split text]
// An example of complex regex parsing using 'prefix' and 'suffix' methods.
// Suppose that we want to split a text into words and also check that
// the first letter of each sentence is capitalized.
std::vector<std::string_view> SplitTextIntoWords(const std::string_view text) {
static const utils::regex word_regex("[a-zA-Z]+");
static const utils::regex punctuation_regex("[., ]*");
static const utils::regex capitalized_word_start_regex("^[A-Z]");

std::vector<std::string_view> words;
utils::match_results word_match;
auto remaining = text;

while (utils::regex_search(remaining, word_match, word_regex)) {
const auto punctuation = word_match.prefix();
if (!utils::regex_match(punctuation, punctuation_regex)) {
throw std::invalid_argument(fmt::format("Invalid characters '{}'", punctuation));
}

const auto word = word_match[0];
const bool should_be_capitalized = words.empty() || punctuation.find('.') != std::string_view::npos;
if (should_be_capitalized && !utils::regex_search(word, capitalized_word_start_regex)) {
throw std::invalid_argument(fmt::format("Word '{}' should be capitalized", word));
}

words.push_back(word);

remaining = word_match.suffix();
}

if (!utils::regex_match(remaining, punctuation_regex)) {
throw std::invalid_argument(fmt::format("Invalid characters '{}'", remaining));
}

return words;
}

// Covers the happy path and each of the three failure branches of SplitTextIntoWords.
TEST(Regex, SplitTextIntoWords) {
// Valid text: words are returned in order, punctuation is dropped.
EXPECT_THAT(
SplitTextIntoWords("Foo bar. Baz, qux quux."), testing::ElementsAre("Foo", "bar", "Baz", "qux", "quux")
);
// Disallowed character between two words.
UEXPECT_THROW_MSG(SplitTextIntoWords("Foo + bar"), std::invalid_argument, "Invalid characters ' + '");
// Lowercase word right after a period.
UEXPECT_THROW_MSG(SplitTextIntoWords("Foo bar. baz."), std::invalid_argument, "Word 'baz' should be capitalized");
// Disallowed character after the last word.
UEXPECT_THROW_MSG(SplitTextIntoWords("Foo, bar% "), std::invalid_argument, "Invalid characters '% '");
}
/// [split text]

} // namespace

TEST(RegexDeathTest, SearchEmptyCaptureGroupsGoldenTest) {
// There could be 2 interpretations of this situation:
// 1. the 2nd capture group of `r` is not present in `str`;
// 2. the 2nd capture group of `r` is present in `str`, but is empty.
// The current implementation of utils::regex chooses interpretation (2), but it's not guaranteed.
const utils::regex r("<([a-z]+)(\\d*)>");
constexpr std::string_view str = " <abc> ";
utils::match_results matches;
EXPECT_TRUE(utils::regex_search(str, matches, r));
ASSERT_TRUE(utils::regex_search(str, matches, r));

ASSERT_TRUE(matches.size() == 3);
EXPECT_EQ(matches[0], "<abc>");
EXPECT_EQ(matches[0].data(), str.data() + 1);
EXPECT_EQ(matches[1], "abc");
EXPECT_EQ(matches[1].data(), str.data() + 2);
EXPECT_EQ(matches[2], "");
EXPECT_EQ(matches[2].data(), str.data() + 5);
EXPECT_EQ(matches[2].data(), str.data() + 5); // implementation detail

EXPECT_EQ(matches.position(0), 1);
EXPECT_EQ(matches.length(0), 5);
EXPECT_EQ(matches.position(1), 2);
EXPECT_EQ(matches.length(1), 3);
EXPECT_UINVARIANT_FAILURE_MSG(
matches.position(2),
"Trying to access position of capturing group 2, which is empty (missing), target=' <abc> '"
);
EXPECT_EQ(matches.length(2), 0);

EXPECT_EQ(matches.prefix(), " ");
EXPECT_EQ(matches.prefix().data(), str.data());
EXPECT_EQ(matches.suffix(), " ");
EXPECT_EQ(matches.suffix().data(), str.data() + 6);
}

TEST(Regex, SearchNonPresentCaptureGroupsGoldenTest) {
TEST(RegexDeathTest, SearchNonPresentCaptureGroupsGoldenTest) {
// 2nd capture group cannot be present in `str` in any way (otherwise nested <> would have to be present),
// so utils::regex must return an invalid std::string_view for the 2nd group.
// The current implementation returns `nullptr` std::string_view, but the exact value of `.data()`
Expand All @@ -171,12 +265,55 @@ TEST(Regex, SearchNonPresentCaptureGroupsGoldenTest) {
constexpr std::string_view str = " <abc> ";
utils::match_results matches;
EXPECT_TRUE(utils::regex_search(str, matches, r));

ASSERT_TRUE(matches.size() == 3);
EXPECT_EQ(matches[0], "<abc>");
EXPECT_EQ(matches[0].data(), str.data() + 1);
EXPECT_EQ(matches[1], "abc");
EXPECT_EQ(matches[1].data(), str.data() + 2);
EXPECT_EQ(matches[2], "");
EXPECT_EQ(matches[2].data(), nullptr);
EXPECT_EQ(matches[2].data(), nullptr); // implementation detail

EXPECT_EQ(matches.position(0), 1);
EXPECT_EQ(matches.length(0), 5);
EXPECT_EQ(matches.position(1), 2);
EXPECT_EQ(matches.length(1), 3);
EXPECT_UINVARIANT_FAILURE_MSG(
matches.position(2),
"Trying to access position of capturing group 2, which is empty (missing), target=' <abc> '"
);
EXPECT_EQ(matches.length(2), 0);

EXPECT_EQ(matches.prefix(), " ");
EXPECT_EQ(matches.prefix().data(), str.data());
EXPECT_EQ(matches.suffix(), " ");
EXPECT_EQ(matches.suffix().data(), str.data() + 6);
}

// Searches an empty (but non-null) target with a pattern that matches the empty string,
// and checks how match_results reports the empty whole match and the empty user group.
TEST(RegexDeathTest, SearchEmptyResult) {
    // Create an empty, but non-null string_view.
    constexpr std::string_view kOriginalString = "foo";
    constexpr auto kEmptySubstring = kOriginalString.substr(1, 0);

    const utils::regex r("(\\d*)");
    utils::match_results matches;
    EXPECT_TRUE(utils::regex_search(kEmptySubstring, matches, r));

    ASSERT_EQ(matches.size(), 2);
    EXPECT_EQ(matches[0], "");
    EXPECT_EQ(matches[0].data(), kOriginalString.data() + 1);  // guaranteed
    EXPECT_EQ(matches[1], "");
    EXPECT_EQ(matches[1].data(), kOriginalString.data() + 1);  // implementation detail

    // Group 0 is valid even when it is empty.
    EXPECT_EQ(matches.position(0), 0);
    EXPECT_EQ(matches.length(0), 0);

    // Group 1 is empty: its position is rejected, but its length is still available.
    EXPECT_UINVARIANT_FAILURE_MSG(
        matches.position(1), "Trying to access position of capturing group 1, which is empty (missing), target=''"
    );
    EXPECT_EQ(matches.length(1), 0);

    EXPECT_EQ(matches.prefix(), "");
    EXPECT_EQ(matches.prefix().data(), kOriginalString.data() + 1);
    EXPECT_EQ(matches.suffix(), "");
    EXPECT_EQ(matches.suffix().data(), kOriginalString.data() + 1);
}

TEST(Regex, Replace) {
Expand Down
4 changes: 2 additions & 2 deletions universal/utest/include/userver/utest/assert_macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ std::string AssertThrow(

std::string AssertNoThrow(std::function<void()> statement, std::string_view statement_text);

std::string QuoteStringForRegex(std::string_view message);
testing::Matcher<const std::string&> MakeHasSubstrMatcher(std::string_view expected);

} // namespace utest::impl

Expand Down Expand Up @@ -132,7 +132,7 @@ USERVER_NAMESPACE_END
#else
// NOLINTNEXTLINE (cppcoreguidelines-macro-usage)
#define EXPECT_UINVARIANT_FAILURE_MSG(statement, message_substring) \
UEXPECT_DEATH(statement, USERVER_NAMESPACE::utest::impl::QuoteStringForRegex(message_substring))
UEXPECT_DEATH(statement, USERVER_NAMESPACE::utest::impl::MakeHasSubstrMatcher(message_substring))
#endif
/// @endcond

Expand Down
4 changes: 2 additions & 2 deletions universal/utest/include/userver/utest/death_tests.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ USERVER_NAMESPACE_END
///
/// @hideinitializer
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define UEXPECT_DEATH(statement, message) \
#define UEXPECT_DEATH(statement, regex_or_matcher) \
for (USERVER_NAMESPACE::utest::impl::DeathTestScope utest_impl_death_test_scope; \
utest_impl_death_test_scope.ShouldKeepIterating(); \
utest_impl_death_test_scope.StopIterating()) \
EXPECT_DEATH(statement, message)
EXPECT_DEATH(statement, regex_or_matcher)
6 changes: 3 additions & 3 deletions universal/utest/src/utest/assert_macros.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <userver/utest/assert_macros.hpp>

#include <fmt/format.h>
#include <re2/re2.h>
#include <gmock/gmock.h>

#include <userver/compiler/demangle.hpp>
#include <userver/logging/stacktrace_cache.hpp>
Expand Down Expand Up @@ -138,8 +138,8 @@ std::string AssertNoThrow(std::function<void()> statement, std::string_view stat
}
}

std::string QuoteStringForRegex(std::string_view message) {
return re2::RE2::QuoteMeta(re2::StringPiece{message.data(), message.size()});
testing::Matcher<const ::std::string&> MakeHasSubstrMatcher(std::string_view expected) {
return testing::HasSubstr(std::string{expected});
}

} // namespace utest::impl
Expand Down

0 comments on commit 48f7ee4

Please sign in to comment.