-
Notifications
You must be signed in to change notification settings - Fork 1
/
about_regular_expressions.rb
177 lines (137 loc) · 6.25 KB
/
about_regular_expressions.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# -*- coding: utf-8 -*-
require File.expand_path(File.dirname(__FILE__) + '/neo')
# Koans exploring Ruby regular expressions: literals, quantifiers,
# character classes, anchors, grouping/captures, alternation, and the
# String helpers (scan, sub, gsub) that accept a pattern.
class AboutRegularExpressions < Neo::Koan
  # A /.../ literal is an instance of Regexp.
  def test_a_pattern_is_a_regular_expression
    pattern = /pattern/
    assert_equal Regexp, pattern.class
  end

  # Indexing a string with a regexp returns the matched text.
  def test_a_regexp_can_search_a_string_for_matching_content
    text = "some matching content"
    assert_equal "match", text[/match/]
  end

  # When nothing matches, String#[] returns nil.
  def test_a_failed_match_returns_nil
    text = "some matching content"
    assert_equal nil, text[/missing/]
  end

  # ------------------------------------------------------------------

  # `?` makes the preceding element optional (zero or one occurrence).
  def test_question_mark_means_optional
    letters = "abbcccddddeeeee"
    assert_equal "ab", letters[/ab?/]
    assert_equal "a", letters[/az?/]
  end

  # `+` requires at least one occurrence of the preceding element.
  def test_plus_means_one_or_more
    letters = "abbcccddddeeeee"
    assert_equal "bccc", letters[/bc+/]
  end

  # `*` accepts any number of occurrences, including none at all.
  def test_asterisk_means_zero_or_more
    letters = "abbcccddddeeeee"
    assert_equal "abb", letters[/ab*/]
    assert_equal "a", letters[/az*/]
    # Zero occurrences is still a match, so the result is "" and not nil.
    assert_equal "", letters[/z*/]

    # THINK ABOUT IT:
    #
    # When would * fail to match?
    #
    # Possible answer:
    #   On its own, never: "zero occurrences" always succeeds, which is why
    #   /z*/ above yields "" instead of nil. A pattern that contains * only
    #   fails when some other, required part of it cannot match -- for
    #   example the leading `a` of /az*/ against a string with no "a".
  end

  # THINK ABOUT IT:
  #
  # We say that the repetition operators above are "greedy."
  #
  # Why?
  #
  # Answer (paraphrasing the Ruby Regexp documentation):
  #   Repetition is greedy by default: as many occurrences as possible are
  #   matched while still allowing the overall match to succeed. Lazy
  #   matching, by contrast, takes the minimal number of occurrences needed
  #   for overall success. A greedy quantifier is made lazy by following it
  #   with `?`.
  #
  # Extra:
  #   A quantifier followed by `+` matches possessively: it behaves like a
  #   greedy quantifier, but once it has matched it does not backtrack, even
  #   if refusing to give characters back makes the overall match fail.

  # ------------------------------------------------------------------

  # The earliest position at which the pattern can match is the one used,
  # even though a longer match ("az") exists later in the string.
  def test_the_left_most_match_wins
    text = "abbccc az"
    assert_equal "a", text[/az*/]
  end

  # ------------------------------------------------------------------

  # [...] is a character class: it matches exactly one character out of
  # the listed set, so /[cbr]at/ accepts "cat", "bat" and "rat" only.
  def test_character_classes_give_options_for_a_character
    candidates = ["cat", "bat", "rat", "zat"]
    accepted = candidates.select { |name| name[/[cbr]at/] }
    assert_equal ["cat", "bat", "rat"], accepted
  end

  # \d is shorthand for the digit character class.
  def test_slash_d_is_a_shortcut_for_a_digit_character_class
    sentence = "the number is 42"
    assert_equal "42", sentence[/[0123456789]+/]
    assert_equal "42", sentence[/\d+/]
  end

  # A range such as 0-9 may be used inside a character class.
  def test_character_classes_can_include_ranges
    sentence = "the number is 42"
    assert_equal "42", sentence[/[0-9]+/]
  end

  # \s is shorthand for the whitespace character class.
  def test_slash_s_is_a_shortcut_for_a_whitespace_character_class
    text = "space: \t\n"
    assert_equal " \t\n", text[/\s+/]
  end

  # \w is shorthand for the word character class.
  def test_slash_w_is_a_shortcut_for_a_word_character_class
    # NOTE: This is more like how a programmer might define a word.
    assignment = "variable_1 = 42"
    assert_equal "variable_1", assignment[/[a-zA-Z0-9_]+/]
    assert_equal "variable_1", assignment[/\w+/]
  end

  # `.` matches any character except a newline, so the match stops at "\n".
  def test_period_is_a_shortcut_for_any_non_newline_character
    two_lines = "abc\n123"
    assert_equal "abc", two_lines[/a.+/]
  end

  # A leading ^ inside a character class negates it: [^0-9] matches any
  # single character that is NOT a digit.
  def test_a_character_class_can_be_negated
    sentence = "the number is 42"
    assert_equal "the number is ", sentence[/[^0-9]+/]
  end

  # The upper-case shortcuts (\D, \S, \W) are the negations of \d, \s, \w.
  def test_shortcut_character_classes_are_negated_with_capitals
    sentence = "the number is 42"
    assert_equal "the number is ", sentence[/\D+/]

    spaced = "space: \t\n"
    assert_equal "space:", spaced[/\S+/]

    # ... a programmer would most likely do
    assignment = "variable_1 = 42"
    assert_equal " = ", assignment[/[^a-zA-Z0-9_]+/]
    assert_equal " = ", assignment[/\W+/]
  end

  # ------------------------------------------------------------------

  # \A only matches at the very beginning of the string.
  def test_slash_a_anchors_to_the_start_of_the_string
    text = "start end"
    assert_equal "start", text[/\Astart/]
    assert_equal nil, text[/\Aend/]
  end

  # \z only matches at the very end of the string.
  def test_slash_z_anchors_to_the_end_of_the_string
    text = "start end"
    assert_equal "end", text[/end\z/]
    assert_equal nil, text[/start\z/]
  end

  # ^ matches at the start of any line, not just the start of the string.
  def test_caret_anchors_to_the_start_of_lines
    two_lines = "num 42\n2 lines"
    assert_equal "2", two_lines[/^\d+/]
  end

  # $ matches at the end of any line, not just the end of the string.
  def test_dollar_sign_anchors_to_the_end_of_lines
    two_lines = "2 lines\nnum 42"
    assert_equal "42", two_lines[/\d+$/]
  end

  # \b matches at a word boundary, so the "vine" buried inside "bovine"
  # is skipped and the match starts at the word "vines".
  def test_slash_b_anchors_to_a_word_boundary
    phrase = "bovine vines"
    assert_equal "vines", phrase[/\bvine./]
  end

  # ------------------------------------------------------------------

  # Parentheses group elements so a quantifier applies to the whole group.
  def test_parentheses_group_contents
    laughter = "ahahaha"
    assert_equal "hahaha", laughter[/(ha)+/]
  end

  # ------------------------------------------------------------------

  # Passing a group number to String#[] returns that capture instead of
  # the whole match.
  def test_parentheses_also_capture_matched_content_by_number
    full_name = "Gray, James"
    assert_equal "Gray", full_name[/(\w+), (\w+)/, 1]
    assert_equal "James", full_name[/(\w+), (\w+)/, 2]
  end

  # After a successful match the captures are also available as $1, $2, ...
  def test_variables_can_also_be_used_to_access_captures
    whole_match = "Name: Gray, James"[/(\w+), (\w+)/]
    assert_equal "Gray, James", whole_match
    assert_equal "Gray", $1
    assert_equal "James", $2
  end

  # ------------------------------------------------------------------

  # `|` separates alternatives; here any one of three first names.
  def test_a_vertical_pipe_means_or
    gray_family = /(James|Dana|Summer) Gray/
    assert_equal "James Gray", "James Gray"[gray_family]
    assert_equal "Summer", "Summer Gray"[gray_family, 1]
    assert_equal nil, "Jim Gray"[gray_family, 1]
  end

  # THINK ABOUT IT:
  #
  # Explain the difference between a character class ([...]) and
  # alternation (|).
  #
  # Answer:
  #   A character class matches exactly ONE character taken from a set
  #   (e.g. [cbr] above), whereas alternation chooses between whole
  #   sub-patterns that may differ in length (e.g. James|Dana|Summer).
  #
  #   From a StackOverflow answer on the topic: character classes are
  #   optimized for matching one out of some set of characters, while
  #   alternatives (x|y) allow for more general choices of varying lengths,
  #   and you will tend to see better performance if you keep that in mind.
  #   Regex implementations turn a pattern such as /[abc]/ into a
  #   finite-state automaton (usually an NFA); a sufficiently smart compiler
  #   could generate the same machine for equivalent regexes, but that is
  #   difficult and expensive in the general case.

  # ------------------------------------------------------------------

  # String#scan returns every match of the pattern as an array.
  def test_scan_is_like_find_all
    words = "one two-three".scan(/\w+/)
    assert_equal ["one", "two", "three"], words
  end

  # String#sub replaces only the first match; the block supplies the
  # replacement (here, the first character of the captured word).
  def test_sub_is_like_find_and_replace
    shortened = "one two-three".sub(/(t\w*)/) { $1[0, 1] }
    assert_equal "one t-three", shortened
  end

  # String#gsub replaces every match, calling the block once per match.
  def test_gsub_is_like_find_and_replace_all
    shortened = "one two-three".gsub(/(t\w*)/) { $1[0, 1] }
    assert_equal "one t-t", shortened
  end
end