-
Notifications
You must be signed in to change notification settings - Fork 7
/
subsection-identifier.php
281 lines (249 loc) · 8.65 KB
/
subsection-identifier.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
<?php
/**
* Subsection Identifier
*
*
* Requires the object $this->text, with each section stored as a child ($this->text->0,
* $this->text->1, etc.) Returns a hierarchically structured, labelled object, named
* $this->structured, with a prefix_hierarchy object (containing an entry for the structural
* identifier of each generation in the structural ancestry, in order), a prefix entry (containing
* the section's structural identifier), and a text entry, with the structural identifier stripped
* off.
*
* PHP version 5
*
* @author Waldo Jaquith <waldo at jaquith.org>
* @copyright 2013 Waldo Jaquith
* @license http://www.gnu.org/licenses/gpl.html GPL 3
* @version 1.0
* @link http://www.statedecoded.com/
* @since 1.0
*
*/
class SubsectionIdentifier
{
    /**
     * The text to parse. Either a single string whose subsections are separated by blank
     * lines ("\n\n"), or an iterable (object or array) holding one subsection per child.
     * A string is replaced by the exploded object when parse() runs.
     */
    public $text;

    /**
     * The result of the last successful parse(): an object with one child per subsection
     * (0, 1, 2, ...), each holding prefix_hierarchy, prefix, and text entries.
     */
    public $structured;

    /**
     * Identify the structural prefix of every subsection in $this->text and store the
     * labelled result in $this->structured.
     *
     * For every subsection that begins with a recognised prefix, the entry contains:
     *   - prefix_hierarchy: an object listing the identifier of each ancestor level, in
     *     order, with periods removed;
     *   - prefix: the complete identifier, all levels concatenated;
     *   - text: the subsection text with its own prefix stripped off.
     *
     * A subsection without a recognised prefix is treated as a continuation of the prior
     * one: its text is stored as-is and it reuses the prior entry's prefix and
     * prefix_hierarchy. If there is no prior entry, both are null.
     *
     * @return bool false if $this->text is not set, true otherwise
     */
    public function parse()
    {

        if (!isset($this->text))
        {
            return false;
        }

        /*
         * Define all possible section prefixes via regexes -- a letter, number, or series of
         * letters that defines an individual subsection of text in a hierarchical fashion. The
         * subsection prefix can be in one of eight formats:
         *
         *     A.    1.    a.    iv.
         *     (A)   (1)   (a)   (iv)
         *
         * That, of course, is four formats expressed in two different fashions -- wrapped in
         * parentheses or followed by a period -- and always followed by a space. Each regex is
         * anchored to the start of the text, so that only a genuine leading prefix is matched
         * (and not, for instance, the "o. " within "No. 5"). Each regex is paired with the
         * list of values in its range; the first value of that list is what tells us, below,
         * whether a candidate is the lowercase alphabetic one.
         */
        $roman_numerals = array('i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
            'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx');

        $prefix_candidates = array(
            '/^[0-9]{1,2}\. /' => range(1, 99),
            '/^\([0-9]{1,2}\) /' => range(1, 99),
            '/^[a-z]{1,2}\. /' => range('a', 'z'),
            '/^\([a-z]{1,2}\) /' => range('a', 'z'),
            '/^[A-Z]{1,2}\. /' => range('A', 'Z'),
            '/^\([A-Z]{1,2}\) /' => range('A', 'Z'),
            '/^([xvi]{1,5})\. /' => $roman_numerals,
            '/^\(([xvi]{1,5})\) /' => $roman_numerals,
        );

        /*
         * Establish a blank prefix structure. We'll build this up and continually modify it to
         * keep track of our current complete section number as we iterate through the text.
         */
        $prefixes = array();

        /*
         * If the text is a string, turn it into an object.
         */
        if (is_string($this->text))
        {
            $this->text = (object) explode("\n\n", $this->text);
        }

        /*
         * Create the containers explicitly. Writing a property onto an undefined variable is
         * an Error as of PHP 8, rather than an implicitly created object.
         */
        $output = new stdClass();
        $previous = null;

        /*
         * Deal with each subsection, one at a time.
         */
        $i = 0;
        foreach ($this->text as $paragraph)
        {

            /*
             * Iterate through our regex candidates until we find one that matches (if, indeed,
             * one does at all).
             */
            $match = null;
            $matched_pattern = null;
            foreach ($prefix_candidates as $pattern => $members)
            {

                if (preg_match($pattern, $paragraph, $matches) !== 1)
                {
                    continue;
                }

                /*
                 * If the matched label contains an "i", but the matching candidate is the
                 * lowercase alphabetic one, skip it and keep iterating, knowing that we'll
                 * get to the Roman numeral candidate soon. Only the label itself is examined
                 * (parentheses, period, and space removed), never the text that follows it,
                 * and the position is compared against false so that an "i" in the very first
                 * position counts, too.
                 */
                $label = trim($matches[0], '(). ');
                if ($members[0] === 'a' && strpos($label, 'i') !== false)
                {
                    continue;
                }

                /*
                 * Great, we've successfully made a match -- this is the beginning of a new
                 * numbered section. Save a platonic ideal of this match.
                 */
                $match = trim($matches[0]);
                $matched_pattern = $pattern;

                /*
                 * Move the matched regex to the beginning of the $prefix_candidates stack, so
                 * that on our next paragraph we'll start with this one. We do that both in the
                 * name of efficiency and to help Roman numerals be identified consistently,
                 * despite being comprised of letters that another regex also accepts.
                 */
                unset($prefix_candidates[$pattern]);
                $prefix_candidates = array($pattern => $members) + $prefix_candidates;

                /*
                 * Stop at the first match. Letting the remaining candidates run would allow a
                 * second regex (e.g., alphabetic for "(v)") to rewrite the hierarchy again.
                 */
                break;

            }

            $entry = new stdClass();

            if ($match !== null)
            {

                /*
                 * Figure out where in the structure our current prefix lives: the first
                 * existing level that the same regex matches. We append a space because the
                 * regex expects one after the prefix, and we trimmed it off when storing.
                 */
                $position = null;
                foreach ($prefixes as $key => $component)
                {
                    if (preg_match($matched_pattern, $component . ' ') === 1)
                    {
                        $position = $key;
                        break;
                    }
                }

                /*
                 * No such level means this is a new, deeper level. Otherwise, replace that
                 * level and lop off everything below it. That is, if we're in A4(c), and our
                 * last section was A4(b)6, then the "6" has to go.
                 */
                if ($position === null)
                {
                    $prefixes[] = $match;
                }
                else
                {
                    $prefixes[$position] = $match;
                    $prefixes = array_slice($prefixes, 0, $position + 1);
                }

                /*
                 * Store each level of the prefix structure, eliminating any periods, and
                 * store the prefix list as a single string.
                 */
                $entry->prefix_hierarchy = new stdClass();
                foreach ($prefixes as $depth => $component)
                {
                    $entry->prefix_hierarchy->{$depth} = str_replace('.', '', $component);
                }
                $entry->prefix = implode('', $prefixes);

                /*
                 * Hack off the prefix at the beginning of the text and save what remains. The
                 * regex guarantees that the prefix is followed by a space, so the second
                 * element always exists.
                 */
                $parts = explode(' ', $paragraph, 2);
                $entry->text = $parts[1];

            }

            /*
             * If no prefix was identified for this section, then it's a continuation of the
             * prior section (in reality, they're probably just paragraphs, not actually
             * "sections"). Reuse the same section identifier and keep the text as-is. When
             * there is no prior section, there is nothing to inherit.
             */
            else
            {
                $entry->text = $paragraph;
                $entry->prefix = ($previous === null) ? null : $previous->prefix;
                $entry->prefix_hierarchy = ($previous === null) ? null : $previous->prefix_hierarchy;
            }

            $output->{$i} = $entry;
            $previous = $entry;
            $i++;

        }

        /*
         * Store the output within the class scope and report success.
         */
        $this->structured = $output;

        return true;

    }
}