-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.php
115 lines (92 loc) · 2.65 KB
/
parser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
<?php
/*
 * Set PHP's default time zone explicitly. We're on Eastern Time (America/New_York covers both EST
 * and EDT); declaring it here keeps PHP from complaining that no time zone has been configured.
 */
date_default_timezone_set('America/New_York');
/*
* Include the Simple HTML DOM Parser.
*/
include('class.simple_html_dom.inc.php');
/*
* Include the Chichester library.
*/
include('class.Chichester.inc.php');
/*
* Include the Subsection Identifier library.
*/
include('class.SubsectionIdentifier.inc.php');
/*
 * Get a list of every section that we already have a copy of, as bare section numbers (the file
 * names found in output/sections/ with ".json" removed).
 *
 * The list is built into a fresh array rather than by rewriting scandir()'s result through a
 * by-reference foreach. A by-reference loop variable stays bound to the last array element after
 * the loop ends, so reusing the name $section later in the script would silently overwrite that
 * element, and the last saved section would never be recognized as already downloaded.
 */
$existing_sections = array();

/*
 * If the output directory doesn't exist yet (e.g., on a first run), there is nothing to list, so
 * we simply start with an empty list instead of looping over scandir()'s FALSE return value.
 */
$section_files = is_dir('output/sections/') ? scandir('output/sections/') : FALSE;
if ($section_files !== FALSE)
{
	foreach ($section_files as $section_file)
	{
		/*
		 * Skip the "." and ".." directory entries, which are never section files.
		 */
		if ( ($section_file === '.') || ($section_file === '..') )
		{
			continue;
		}
		$existing_sections[] = str_replace('.json', '', $section_file);
	}
}
/*
 * Set up the parser object that the rest of this script drives.
 */
$chichester = new Chichester();

/*
 * Have the parser process the table of contents page, then store the resulting agency list on
 * disk as JSON.
 */
$chichester->parse_toc();
$agencies_json = json_encode($chichester->agencies);
file_put_contents('output/agencies.json', $agencies_json);
/*
 * Iterate through the agency list and retrieve each agency's TOC (section list). Each agency's
 * section list is saved to output/agency-[toc_id].json, and each section that we don't already
 * have is saved to output/sections/[section_number].json.
 */
foreach ($chichester->agencies as $agency)
{

	echo $agency->name . PHP_EOL;

	$chichester->agency_id = $agency->toc_id;

	/*
	 * Retrieve the list of sections for this agency. If that fails, report it and move on to the
	 * next agency: at that point $chichester->sections can't be trusted to describe this agency
	 * (it may still hold the previous agency's list), so saving it or iterating over it would
	 * attribute the wrong sections to this agency.
	 */
	try
	{
		$chichester->parse_agency();
	}
	catch (Exception $e)
	{
		echo ('Fatal error for agency ' . $chichester->agency_id . ': ' . $e->getMessage()) . PHP_EOL;
		continue;
	}

	// We're getting duplicate agency records. For instance, the ABC (agency
	// [The note above was left unfinished in the original source.]
	file_put_contents('output/agency-' . $agency->toc_id . '.json', json_encode($chichester->sections));

	/*
	 * Now iterate through each section in this agency.
	 */
	foreach ($chichester->sections as $section)
	{

		/*
		 * The section number is taken from the third "+"-delimited component of the section's
		 * official URL. If the URL doesn't have that many components, we can't identify the
		 * section, so skip it rather than reading an undefined array offset.
		 */
		$components = explode('+', $section->official_url);
		if (!isset($components[2]))
		{
			echo 'Skipping section with unrecognized URL: ' . $section->official_url . PHP_EOL;
			continue;
		}
		$section_number = $components[2];

		/*
		 * If we already have a copy of this section, skip it. The comparison is strict, so that
		 * numeric-looking section numbers (e.g., "1.10" and "1.1") are never treated as equal.
		 */
		if (is_array($existing_sections) && in_array($section_number, $existing_sections, TRUE))
		{
			echo 'Skipping ' . $section_number . PHP_EOL;
			continue;
		}

		// THIS IS A MISTAKE. Ultimately, we even want to save repealed and removed sections.
		if ( ($section->repealed === FALSE) && ($section->removed === FALSE) )
		{
			$chichester->url = $section->official_url;
			$chichester->fetch_html();
			$chichester->parse_section();
			file_put_contents('output/sections/' . $chichester->section->section_number . '.json',
				json_encode($chichester->section));
			echo '* ' . $chichester->section->section_number . PHP_EOL;
		}

		/*
		 * Sleep for .51 seconds. If we don't do this, we'll be locked out of leg1.state.va.us,
		 * which limits requests to 30 per 60 seconds.
		 *
		 * NOTE(review): a .51-second pause by itself allows well over 30 requests per minute;
		 * confirm the actual rate limit before changing this value.
		 */
		usleep(510000);

	}

}
// store the now-complete TOC as a JSON file
// iterate through the TOC
// fetch the section
// parse the section
// store the section as a JSON file