Skip to content

Commit

Permalink
Spruce up the bill geocoder some
Browse files Browse the repository at this point in the history
This still uses absolutely ancient code, and is missing a bunch of sanity-checking, but it actually works again, so that's something.
  • Loading branch information
waldoj committed Jan 7, 2024
1 parent 8dcc7f5 commit 1258a18
Showing 1 changed file with 173 additions and 130 deletions.
303 changes: 173 additions & 130 deletions cron/update_places.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
/*
DANGER, WILL ROBINSON!
If a bill has no placenames in it, we never mark it as being name-less. The result is that we run
a query against Yahoo over and over and over, always getting no results. That costs $6/1,000, so
that's probably something we'll want to prevent.
a query over and over and over, always getting no results.
*/

# INCLUDES
Expand All @@ -18,8 +17,10 @@
# page.
connect_to_db();

# Select all bills that contain a phrase concerning geography for which we don't already have
# location records stored.
/*
* Select all bills that contain a phrase concerning geography for which we don't already have
* location records stored.
*/
$sql = 'SELECT bills.id, bills.number, bills.full_text, sessions.year
FROM bills
LEFT JOIN sessions
Expand All @@ -40,162 +41,204 @@
ORDER BY RAND()
LIMIT 10';
$result = mysql_query($sql);
if (mysql_num_rows($result) > 0)
if (mysql_num_rows($result) == 0)
{
return;
}

/*
* Connect to Memcached, as we may well be interacting with it during this session.
*/
$mc = new Memcached();
$mc->addServer(MEMCACHED_SERVER, MEMCACHED_PORT);
/*
* Connect to Memcached, as we may well be interacting with it during this session.
*/
$mc = new Memcached();
$mc->addServer(MEMCACHED_SERVER, MEMCACHED_PORT);

# Set up cURL for the queries to follow.
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_HTTPHEADER, 'Authorization: apiKey ' . GEOPARSER_KEY);
curl_setopt($ch, CURLOPT_POST, 1);
/*
 * Set up for queries to OpenAI: the API key, the chat-completions endpoint, and the system
 * prompt ($role) that accompanies every request.
 */
$api_key = OPENAI_KEY;
$endpoint = 'https://api.openai.com/v1/chat/completions';
/*
 * The system prompt asks for a comma-separated list of Virginia counties, cities, and towns,
 * each written out in full ("City of ...", "County of ...", "Town of ..."), and asks for no
 * output at all when the text names no places.
 */
$role = 'You are a helpful assistant who identifies the names of places mentioned in '
. 'the languages of legislation before the Virginia General Assembly. Given the text '
. 'of a bill, you will extract the name of every Virginia county, city, and town mentioned, '
. 'creating a list of them. You will separate each place name by commas. You will use the full name, of each place '
. '(e.g. "City of Fairfax," "County of Fairfax," or "Town of Scottsville.") If no places '
. 'are in the text at all, remain silent.' . "\n\n";

/*
 * Create an initial connection to the endpoint, to be reused on each loop. The handle is set
 * to return the response body as a string (rather than printing it), to send POST requests,
 * and to send a JSON content type along with the bearer token.
 */
$ch = curl_init($endpoint);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'Content-Type: application/json',
'Authorization: Bearer ' . $api_key
]);

/*
* Iterate through the bills.
*/
while ($bill = mysql_fetch_array($result))
{

$bill = array_map('stripslashes', $bill);

/*
* Iterate through the bills.
* Get bill information from the API, take all of the text that's changing, and put it into
* a single string. If there's no diff of changed text, then use the bill's full text.
*/
while ($bill = mysql_fetch_array($result))
$bill_info = file_get_contents('https://api.richmondsunlight.com/1.1/bill/' . $bill['year'] . '/' . $bill['number'] . '.json');
if ($bill_info == FALSE)
{

$bill = array_map('stripslashes', $bill);
$url = 'https://geoparser.io/api/geoparser';

/*
* Get bill information from the API, take all of the text that's changing, and put it into
* a single string. If there's no diff of changed text, then use the bill's full text.
*/
$bill_info = file_get_contents('https://api.richmondsunlight.com/1.1/bill/' . $bill['year'] . '/' . $bill['number'] . '.json');
if ($bill_info == FALSE)
{
continue;
}
$bill_info = json_decode($bill_info);
if ( empty($bill_info->changes) || count($bill_info->changes == 0) )
{
$bill_place_text = strip_tags($bill_info->full_text);
}
else
continue;
}
$bill_info = json_decode($bill_info);
if ( empty($bill_info->changes) || count($bill_info->changes) == 0 )
{
$prompt = strip_tags($bill_info->full_text);
}
else
{
$prompt = '';
foreach ($bill_info->changes as $change)
{
$bill_place_text = '';
foreach ($bill_info->changes as $change)
{
$bill_place_text .= $change->text .= "\n";
}
$prompt .= $change->text .= "\n";
}
}

/*
* Define our fields.
*/
$fields = array(
'inputText' => $bill_place_text,
);
foreach ($fields as $key=>$value)
/*
 * Skip bills whose text is too short to plausibly name a place.
 */
if (strlen($prompt) < 8)
{
	continue;
}

/*
 * Build the chat-completion request. The system message carries the extraction instructions
 * ($role); the user message carries the bill text.
 */
$data = [
	'model' => 'gpt-4-1106-preview',
	'messages' => [
		['role' => 'system', 'content' => $role],
		['role' => 'user', 'content' => 'Please extract place names from the following text: '
			. $prompt]
	]
];

/*
 * json_encode() returns false when the data cannot be encoded (for example, bill text that
 * contains invalid UTF-8). There is nothing useful to send in that case, so skip the bill.
 */
$payload = json_encode($data);
if ($payload === false)
{
	continue;
}

/*
 * Submit query
 */
curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
$response = curl_exec($ch);
if ( ($response === false) || curl_errno($ch) )
{
	echo 'cURL error: ' . curl_error($ch);

	/*
	 * NOTE(review): $log is not defined anywhere in the visible part of this file, so only
	 * use it when it exists -- confirm whether one of the includes provides it.
	 */
	if (isset($log))
	{
		$log->put('ERROR: Could not query OpenAI API, with this failure: ' . curl_error($ch), 3);
	}

	/*
	 * The request failed, so there is no response to decode. Move on to the next bill.
	 */
	continue;
}

/*
 * Use the response. Anything that lacks the expected message content (malformed JSON, an API
 * error payload) is skipped.
 */
$response = json_decode($response, true);
if (!isset($response['choices'][0]['message']['content']))
{
	continue;
}

$generated_text = trim($response['choices'][0]['message']['content']);

/*
 * These responses indicate that ChatGPT hasn't found any place names.
 */
if ($generated_text === '')
{
	continue;
}
$negative_responses = [ 'mentions', 'mentioned', 'provided text', 'specific'];
foreach ($negative_responses as $negative_response)
{
if (stripos($generated_text, $negative_response) !== false)
{
$query_string .= $key.'='.urlencode($value).'&';
continue(2);
}
$query_string = rtrim($query_string,'&');
}

/*
* Tell cURL the URL to which we'll be POSTing.
*/
curl_setopt($ch, CURLOPT_URL, $url);
$places = explode(', ', $generated_text);

/*
* Indicate the number of fields that we'll be providing content for.
*/
curl_setopt($ch, CURLOPT_POST, count($fields));
/*
* Iterate through each returned place
*/
foreach ($places as $place)
{

/*
* Pass the POST data.
* We need different queries for different types of municipalities
*/
curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
if (stripos($place, 'County') !== false)
{

/*
* Get the data from cURL.
*/
ob_start();
curl_exec($ch);
curl_close($ch);
$json = ob_get_contents();
ob_end_clean();
if (stripos($place, 'County of') !== false)
{
$place = preg_replace('/County of (.+)/', '$1 County', $place);
}

if ($json == FALSE)
$sql = 'SELECT latitude, longitude
FROM gazetteer
WHERE
name="' . $place . '" AND
municipality IS NULL';
}
elseif (stripos($place, 'City of ') !== false)
{
continue;
$place = str_replace('City of ', '', $place);
$sql = 'SELECT latitude, longitude
FROM gazetteer
WHERE
name="' . $place . '" AND
municipality IS NOT NULL';
}
elseif (stripos($place, 'Town of ') !== false)
{
$place = str_replace('Town of ', '', $place);
$sql = 'SELECT latitude, longitude
FROM gazetteer
WHERE
name="' . $place . '" AND
municipality IS NOT NULL';
}

$yahoo_response = json_decode($json, true);
$town_result = mysql_query($sql);

if ( ($yahoo_response == FALSE) || !isset($yahoo_response['document']) )
/*
 * Skip this place unless the gazetteer query returned exactly one row. A failed query gives
 * us nothing to use, zero rows means there are no coordinates to store, and with more than
 * one row we have no way to pick between them.
 */
if ( ($town_result === false) || (mysql_num_rows($town_result) !== 1) )
{
	continue;
}

echo '<h1>'.$bill['year'].' '.$bill['number'].'</h1>';

foreach ($yahoo_response['document'] as $key => $response)
$coordinates = mysql_fetch_array($town_result);

$sql = 'INSERT INTO bills_places
SET
bill_id=' . $bill['id'] . ',
placename="' . addslashes($place) . '",
latitude=' . $coordinates['latitude'] . ',
longitude=' . $coordinates['longitude'];
$place_result = mysql_query($sql);
if ($place_result == false)
{

# Mixed in with the named keys are numbered keys, and the numbered keys contain the
# useful results. We just skip the non-numbered keys.
if (!is_numeric($key))
{
continue;
}

echo '<h2>'.$key.'</h2>';
echo '<pre>'.print_r($response, true).'</pre>';

# If this found place isn't in the state of Virginia, we have no interest in it. Or if
# it's the phrase "Commonwealth of Virginia" resulting in the town of Commonwealth being
# looked up. Or if it's the town of Marshall, that's just a bill introduced by Bob
# or Danny Marshall.
if (
(strpos($response['placeDetails']['place']['name'], ', VA, US') === false)
||
(strpos($response['placeDetails']['place']['name'], 'Commonwealth') !== false)
||
(['placeDetails']['place']['name'] == 'Marshall')
)
{
continue;
}

echo '<p>Extracting location from response.</p>';

$place['latitude'] = $response['placeDetails']['place']['centroid']['latitude'];
$place['longitude'] = $response['placeDetails']['place']['centroid']['longitude'];
$place['name'] = str_replace(', VA, US', '', $response['placeDetails']['place']['name']);

echo '<pre>'.print_r($place, true).'</pre>';

////////////////////////
// * Duplicates happen. You'd think Yahoo would filter them out, but they do not. Either
// unique the data that's going to be stored pre-storage or modify the DB to be OK
// with this.
////////////////////////
$sql = 'INSERT INTO bills_places
SET bill_id='.$bill['id'].', placename="'.addslashes($place['name']).'",
latitude='.$place['latitude'].', longitude='.$place['longitude'];
mysql_query($sql);
echo '<p>'.$sql.'</p>';

/*
* Clear the bill from Memcached.
*/
$mc->delete('bill-' . $bill['id']);

unset($place);

//$log->put('Error: Could not add place names for ' . strtoupper($bill['number']), 4);
}

}

# Shut down the cURL connection.
curl_close($ch);
/*
* Clear the bill from Memcached.
*/
//$mc->delete('bill-' . $bill['id']);

//$log->put('Identified place names in ' . strtoupper($bill['number']), 2);

}

/*
* Shut down the cURL connection.
*/
curl_close($ch);

0 comments on commit 1258a18

Please sign in to comment.