Skip to content

Commit

Permalink
Take a first crack at implementing bulk summary data
Browse files Browse the repository at this point in the history
Toward #39. [ci skip]
  • Loading branch information
waldoj committed Jan 6, 2018
1 parent 2039a15 commit 6d9a3f4
Showing 1 changed file with 221 additions and 0 deletions.
221 changes: 221 additions & 0 deletions cron/summaries_new.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
<?php

###
# UPDATE BILL SUMMARIES
###

/*
 * Download this session's Summaries.csv from the legislature's FTP server. If nothing comes
 * back, log the problem and stop.
 */
$summaries_url = 'ftp://' . LIS_FTP_USERNAME . ':' . LIS_FTP_PASSWORD
    . '@legis.state.va.us/fromdlas/csv' . $dlas_session_id . '/Summaries.csv';
$summaries = get_content($summaries_url);

# empty() covers FALSE, NULL, and the empty string alike.
if (empty($summaries))
{
    $log->put('Summaries.csv doesn’t exist on legis.state.va.us.', 8);
    return FALSE;
}

/*
 * Remove any white space. This has to happen before the comparison below, because the copy that
 * we save to disk is the trimmed one; hashing the untrimmed download would never match it if the
 * download has leading or trailing white space.
 */
$summaries = trim($summaries);

/*
 * If the MD5 value of the new file is the same as the saved file, then there's nothing to update.
 * The saved file is addressed relative to this script (the same path that it is written to
 * below), and it won't exist yet on the very first run.
 */
$summaries_path = __DIR__ . '/summaries.csv';
if (file_exists($summaries_path) && md5($summaries) === md5_file($summaries_path))
{
    $log->put('Not updating summaries, because summaries.csv has not been modified since it was last downloaded.', 2);
    return FALSE;
}

/*
 * Save the summaries locally.
 */
if (file_put_contents($summaries_path, $summaries) === FALSE)
{
    $log->put('summaries.csv could not be saved to the filesystem.', 8);
    return FALSE;
}

/*
 * Get a read-only handle on the local copy of the CSV, bailing out if it can't be opened.
 */
if (($fp = fopen(__DIR__ . '/summaries.csv', 'r')) === FALSE)
{
    $log->put('summaries.csv could not be read from the filesystem.', 8);
    return FALSE;
}

/*
 * Also, retrieve our saved serialized array of hash data, so that we can only update or insert
 * summaries that have changed, or that are new. We start from an empty array and only replace it
 * if the stored file unserializes to an array: a truncated or corrupt file makes unserialize()
 * return FALSE, which must not be used as the hash table.
 */
$hash_path = __DIR__ . '/hashes/summaries-' . SESSION_ID . '.md5';
$hashes = array();
if (file_exists($hash_path))
{
    $stored_hashes = file_get_contents($hash_path);
    if ($stored_hashes !== FALSE)
    {
        $stored_hashes = unserialize($stored_hashes);
        if (is_array($stored_hashes))
        {
            $hashes = $stored_hashes;
        }
    }
}
elseif (!file_exists(__DIR__ . '/hashes/'))
{
    # No hash file yet, and no directory to keep one in, so create the directory now.
    mkdir(__DIR__ . '/hashes');
}

/*
 * Generate a map of every bill number in this session to its database ID, so that each summary
 * can be matched to the bill that it describes. Numbers are lowercased, so that lookups don't
 * depend on how either source capitalizes them.
 */
$bills = array();
$sql = 'SELECT bills.id, bills.number
        FROM bills
        WHERE session_id = ' . $session_id;
$result = mysql_query($sql);
if ($result === FALSE)
{
    $log->put('The list of bills could not be retrieved, so summaries cannot be updated.', 8);
    fclose($fp);
    return FALSE;
}
while ($bill = mysql_fetch_array($result))
{
    $bills[strtolower(trim($bill['number']))] = $bill['id'];
}

/*
 * The CSV's column names, and the friendlier names that we use for them.
 */
$field_names = array(
    'SUM_BILNO' => 'number',
    'SUMMARY_DOCID' => 'doc_id',
    'SUMMARY_TYPE' => 'type',
    'SUMMARY_TEXT' => 'text'
);

/*
 * The position of each field within a row. This is filled in from the header row, which is the
 * first row of the file; until then it is NULL, which is how we recognize the header.
 */
$columns = NULL;

/*
 * HTML Purifier is only built if at least one summary needs cleaning, and then reused.
 */
$summary_purifier = NULL;

/*
 * Step through each row in the CSV file, one by one. The length is 0 (unlimited), because
 * summaries are routinely longer than any small fixed limit, which would split them.
 */
while (($row = fgetcsv($fp, 0, ',')) !== FALSE)
{

    # fgetcsv() reports a blank line as an array containing a single NULL.
    if ($row === array(NULL))
    {
        continue;
    }

    /*
     * fgetcsv() returns numerically indexed rows, so use the header row to learn where each
     * field lives, rather than assuming that rows are keyed by column name.
     */
    if ($columns === NULL)
    {
        $columns = array();
        foreach ($row as $position => $name)
        {
            $name = strtoupper(trim($name));
            if (isset($field_names[$name]))
            {
                $columns[$field_names[$name]] = $position;
            }
        }

        if (!isset($columns['number']) || !isset($columns['text']))
        {
            $log->put('summaries.csv lacks the expected SUM_BILNO and SUMMARY_TEXT columns.', 8);
            break;
        }

        continue;
    }

    /*
     * Rename each field to something reasonable.
     */
    $summary = array();
    foreach ($columns as $field => $position)
    {
        $summary[$field] = isset($row[$position]) ? $row[$position] : '';
    }

    /*
     * Change the format of the bill number. In this file, the numeric portions are left-padded
     * with zeros, so that e.g. HB1 is rendered as HB0001. Here we change them to e.g. hb1.
     */
    $number = strtolower(trim($summary['number']));
    if (preg_match('/^([a-z]+)0*([0-9]+)$/', $number, $parts))
    {
        $number = $parts[1] . $parts[2];
    }
    if ($number === '')
    {
        continue;
    }
    $summary['number'] = $number;

    /*
     * Before we proceed any farther, see if this record is either new or different than last
     * time that we examined it.
     * NOTE(review): hashes are keyed by bill number alone, so if the file can carry more than
     * one summary row per bill, those rows will keep displacing each other -- confirm.
     */
    $hash = md5(serialize($summary));
    $is_new = !isset($hashes[$number]);
    if (!$is_new && $hash === $hashes[$number])
    {
        continue;
    }

    /*
     * We can only store a summary for a bill that we know about. The hash is deliberately not
     * recorded here, so that the summary is picked up once the bill has been added.
     */
    if (!isset($bills[$number]))
    {
        $log->put('Summary of ' . strtoupper($number) . ' skipped, because that bill is not in the database.', 2);
        continue;
    }

    if ($is_new)
    {
        $log->put('Adding summary ' . strtoupper($number) . '.', 2);
    }
    else
    {
        $log->put('Updating summary ' . strtoupper($number) . '.', 1);
    }

    /*
     * Remove the newlines and NBSP entities. (The literal text "\u00a0" is removed too, as the
     * original replacement did; an actual NBSP character is dealt with after purifying.)
     */
    $text = str_replace(array("\r", "\n", '&nbsp;', '\u00a0'), ' ', $summary['text']);

    # Turn line breaks into spaces, however the tag happens to be written, so that the words on
    # either side aren't run together when the remaining tags are stripped.
    $text = preg_replace('/<br\b[^>]*>/i', ' ', $text);
    $text = strip_tags($text, '<b><i><em><strong>');

    # Run the summary through HTML Purifier.
    if ($summary_purifier === NULL)
    {
        $summary_purifier = new HTMLPurifier(HTMLPurifier_Config::createDefault());
    }
    $text = $summary_purifier->purify($text);

    # Replace any UTF-8 non-breaking space characters with ordinary spaces.
    $text = str_replace("\xC2\xA0", ' ', $text);

    # Clean up the bolding, so that we don't bold a blank space.
    $text = str_replace(' </b>', '</b> ', $text);

    # Collapse runs of spaces, and trim off any whitespace.
    $text = trim(preg_replace('/ {2,}/', ' ', $text));

    # Hack off a hanging non-breaking space, if there is one. The suffix is seven characters
    # long, so that is exactly how much we remove.
    if (substr($text, -7) === ' &nbsp;')
    {
        $text = rtrim(substr($text, 0, -7));
    }

    # Put the data back into the database.
    if ($text !== '')
    {

        $sql = 'UPDATE bills
                SET summary="' . mysql_real_escape_string($text) . '"
                WHERE id="' . mysql_real_escape_string($bills[$number]) . '"
                AND session_id = ' . $session_id;
        $result = mysql_query($sql);
        if (!$result)
        {
            # Leave the stored hash alone, so that this summary is retried on the next run.
            $log->put('Insertion of '. strtoupper($number) . ' summary failed.', 6);
        }
        else
        {
            $hashes[$number] = $hash;
            $log->put('Insertion of '. strtoupper($number) . ' summary succeeded.', 1);
        }

    }
    else
    {
        # Remember the blank summary, so that we don't report it again until it changes.
        $hashes[$number] = $hash;
        $log->put('Summary of ' . strtoupper($number) . ' is blank.', 2);
    }

} // end looping through lines in this CSV file

# Close the CSV file.
fclose($fp);

# Store our per-bill hashes array to a file, so that we can open it up next time and see which
# bills have changed. If this fails, the next run will simply treat every summary as changed, but
# say so rather than failing silently.
if (file_put_contents($hash_path, serialize($hashes)) === FALSE)
{
    $log->put('Summary hashes could not be saved to ' . $hash_path . '.', 6);
}

0 comments on commit 6d9a3f4

Please sign in to comment.