
Commit 52747e2

CC-11: repair and improve blob counting metrics
1 parent 5ffb361 commit 52747e2

4 files changed, +23 -21 lines changed

datasources/bampfa/solrETL-internal.sh

Lines changed: 3 additions & 3 deletions
@@ -51,9 +51,9 @@ time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-internal/update/cs
 time python evaluate.py 4solr.$TENANT.internal.csv /dev/null > counts.internal.csv &
 # get rid of intermediate files
 rm d?.csv m?.csv b?.csv media.csv metadata.csv &
-cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
-cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
-cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.csv
+cut -f43 4solr.${TENANT}.internal.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
+cut -f43 4solr.${TENANT}.internal.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
+cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.blobs.csv
 cat counts.internal.blobs.csv
 wait
 # zip up .csvs, save a bit of space on backups
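
The two count lines form a types/tokens pair: the first counts records whose blob_ss cell is non-empty (records with at least one blob); the second splits multivalued cells on , and | before counting, giving the total number of blobs. The fix points both at the internal export instead of the public one and corrects the /tmp filename. A minimal illustration of the same pipeline, assuming a made-up two-column sample file (the real scripts cut the tenant-specific blob_ss column, -f43 here):

# hypothetical sample: tab-separated, 2nd column is blob_ss
printf 'id\tblob_ss\n1\ta|b\n2\t\n3\tc\n' > sample.csv
# records with at least one blob ("types") -> 2
cut -f2 sample.csv | grep -v 'blob_ss' | perl -pe 's/\r//' | grep . | wc -l
# individual blobs ("tokens"): split cells on , and | first -> 3
cut -f2 sample.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l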

datasources/bampfa/solrETL-public.sh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ time python evaluate.py 4solr.$TENANT.public.csv /dev/null > counts.public.csv &
 rm d?.csv m?.csv b?.csv media.csv metadata.csv &
 cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
 cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cp counts.public.blobs.csv /tmp/$TENANT.counts.public.blobs.csv
 cat counts.public.blobs.csv
 wait
 cp counts.public.csv /tmp/$TENANT.counts.public.csv
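
The renamed /tmp target fixes a collision: the old destination, /tmp/$TENANT.counts.public.csv, is the same path the record counts are copied to on the last line of this hunk, so whichever cp ran second silently clobbered the other metric.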

datasources/pahma/solrETL-locations.sh

Lines changed: 17 additions & 15 deletions
@@ -21,11 +21,13 @@ export NUMCOLS=36
 ##############################################################################
 # extract locations, past and present, from CSpace
 ##############################################################################
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv &
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv &
+wait
 # cleanup newlines and crlf in data, then switch record separator.
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv &
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv &
+wait
 rm m1.csv m2.csv
 ##############################################################################
 # stitch the two files together
@@ -36,15 +38,19 @@ wait
 rm m1a.csv m2a.csv
 time join -j 1 -t $'\t' m1a.sort.csv m2a.sort.csv > m3.sort.csv
 rm m1a.sort.csv m2a.sort.csv
-cut -f1-5,10-14 m3.sort.csv > m4.csv
+cut -f1-5,7- m3.sort.csv > m4.csv
 ##############################################################################
 # we want to recover and use our "special" solr-friendly header, which got buried
 ##############################################################################
-grep -P "^csid_s\t" m4.csv > header4Solr.csv
-grep -v -P "^csid_s\t" m4.csv > m5.csv
+grep -P "^csid_s\t" m4.csv > header4Solr.csv &
+grep -v -P "^csid_s\t" m4.csv > m5.csv &
+wait
 cat header4Solr.csv m5.csv > m4.csv
 rm m5.csv m3.sort.csv
-time perl -ne " \$x = \$_ ;s/[^\t]//g; if (length eq 8) { print \$x;}" m4.csv > 4solr.${TENANT}.locations.csv
+##############################################################################
+# count the types and tokens in the final file
+##############################################################################
+time python evaluate.py m4.csv 4solr.${TENANT}.locations.csv > counts.locations.csv
 ##############################################################################
 # ok, now let's load this into solr...
 # clear out the existing data
@@ -56,14 +62,10 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-locations/update" --data '<comm
 # note, among other things, the overriding of the encapsulator with \
 ##############################################################################
 time curl -X POST -s -S 'http://localhost:8983/solr/pahma-locations/update/csv?commit=true&header=true&trim=true&separator=%09&encapsulator=\' -T 4solr.pahma.locations.csv -H 'Content-type:text/plain; charset=utf-8' &
-##############################################################################
-# count the types and tokens in the final file
-##############################################################################
-time python evaluate.py 4solr.$TENANT.locations.csv /dev/null > counts.locations.csv &
 # count blobs
-cut -f67 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
-cut -f67 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cut -f67 4solr.${TENANT}.locations.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.locations.blobs.csv
+cut -f67 4solr.${TENANT}.locations.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.locations.blobs.csv
+cp counts.locations.blobs.csv /tmp/$TENANT.counts.locations.csv
 # get rid of intermediate files
 rm m4.csv
 wait
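
Most of the churn in this file is one pattern applied three times: start two independent steps in the background with &, then wait before consuming their outputs. A minimal sketch of the pattern, with hypothetical stand-ins for the paired psql, perl, and grep steps:

# produce_m1/produce_m2 are placeholders, not functions from the commit
produce_m1() { sleep 1; echo a > m1.csv; }
produce_m2() { sleep 1; echo b > m2.csv; }
produce_m1 &        # run both concurrently
produce_m2 &
wait                # block until both background jobs have finished
cat m1.csv m2.csv   # only now are the outputs safe to read

Beyond the parallelization, the hunks fix the cut field list (-f1-5,7- for -f1-5,10-14), move the counting step ahead of the Solr load (evaluate.py now appears to write 4solr.${TENANT}.locations.csv from m4.csv, replacing the perl tab-count filter), and point the blob counts at the locations export instead of the public one.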

datasources/pahma/solrETL-osteology.sh

Lines changed: 2 additions & 2 deletions
@@ -54,8 +54,8 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-osteology/update" --data '<comm
 time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-osteology/update/csv?commit=true&header=true&separator=%09&f.taxon_ss.split=true&f.taxon_ss.separator=%7C&f.objculturedepicted_ss.split=true&f.objculturedepicted_sss.separator=%7C&f.objplacedepicted_ss.split=true&f.objplacedepicted_ss.separator=%7C&f.objpersondepicted_ss.split=true&f.objpersondepicted_ss.separator=%7C&f.status_ss.split=true&f.status_ss.separator=%7C&f.audio_md5_ss.split=true&f.audio_md5_ss.separator=%7C&f.blob_md5_ss.split=true&f.blob_md5_ss.separator=%7C&f.card_md5_ss.split=true&f.card_md5_ss.separator=%7C&f.x3d_md5_ss.split=true&f.x3d_md5_ss.separator=%7C&f.x3d_csid_ss.split=true&f.x3d_csid_ss.separator=%7C&f.video_md5_ss.split=true&f.video_md5_ss.separator=%7C&f.aggregate_ss.split=true&f.aggregate_ss.separator=%2C&f.objpp_ss.split=true&f.objpp_ss.separator=%7C&f.anonymousdonor_ss.split=true&f.anonymousdonor_ss.separator=%7C&f.objaltnum_ss.split=true&f.objaltnum_ss.separator=%7C&f.objfilecode_ss.split=true&f.objfilecode_ss.separator=%7C&f.objdimensions_ss.split=true&f.objdimensions_ss.separator=%7C&f.objmaterials_ss.split=true&f.objmaterials_ss.separator=%7C&f.objinscrtext_ss.split=true&f.objinscrtext_ss.separator=%7C&f.objcollector_ss.split=true&f.objcollector_ss.separator=%7C&f.objaccno_ss.split=true&f.objaccno_ss.separator=%7C&f.objaccdate_ss.split=true&f.objaccdate_ss.separator=%7C&f.objacqdate_ss.split=true&f.objacqdate_ss.separator=%7C&f.objassoccult_ss.split=true&f.objassoccult_ss.separator=%7C&f.objculturetree_ss.split=true&f.objculturetree_ss.separator=%7C&f.objfcptree_ss.split=true&f.objfcptree_ss.separator=%7C&f.grouptitle_ss.split=true&f.grouptitle_ss.separator=%7C&f.objmaker_ss.split=true&f.objmaker_ss.separator=%7C&f.objaccdate_begin_dts.split=true&f.objaccdate_begin_dts.separator=%7C&f.objacqdate_begin_dts.split=true&f.objacqdate_begin_dts.separator=%7C&f.objaccdate_end_dts.split=true&f.objaccdate_end_dts.separator=%7C&f.objacqdate_end_dts.split=true&f.objacqdate_end_dts.separator=%7C&f.blob_ss.split=true&f.blob_ss.separator=%7C&f.card_ss.split=true&f.card_ss.separator=%7C&f.imagetype_ss.split=true&f.imagetype_ss.separator=%7C&encapsulator=\\" -T 4solr.${TENANT}.osteology.csv -H 'Content-type:text/plain; charset=utf-8' &
 rm o?.csv header4Solr.csv
 # count blobs
-cut -f78 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
-cut -f78 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
+cut -f78 4solr.${TENANT}.osteology.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
+cut -f78 4solr.${TENANT}.osteology.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
 wait
 cp counts.osteology.blobs.csv /tmp/$TENANT.counts.osteology.blobs.csv
 cat counts.osteology.blobs.csv
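
The root cause was the same in all four scripts: the blob-count pipelines read 4solr.${TENANT}.public.csv even in the internal, locations, and osteology ETLs, so three of the four metrics ran against the wrong export. Because the blob_ss column index is hardcoded per export (-f43, -f67, -f78), a quick sanity check along these lines (illustrative only, not part of the commit) confirms the column position:

# print the 1-based position of blob_ss in a tab-separated header row
head -1 4solr.${TENANT}.osteology.csv | tr '\t' '\n' | grep -n '^blob_ss$'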
