
Commit 52747e2

CC-11: repair and improve blob counting metrics
1 parent 5ffb361 commit 52747e2

4 files changed, +23 -21 lines changed

datasources/bampfa/solrETL-internal.sh

Lines changed: 3 additions & 3 deletions
@@ -51,9 +51,9 @@ time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-internal/update/cs
 time python evaluate.py 4solr.$TENANT.internal.csv /dev/null > counts.internal.csv &
 # get rid of intermediate files
 rm d?.csv m?.csv b?.csv media.csv metadata.csv &
-cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
-cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
-cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.csv
+cut -f43 4solr.${TENANT}.internal.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
+cut -f43 4solr.${TENANT}.internal.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
+cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.blobs.csv
 cat counts.internal.blobs.csv
 wait
 # zip up .csvs, save a bit of space on backups
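
The two count lines form a types/tokens pair: the first counts records whose blob_ss cell is non-empty (records with at least one blob); the second splits multivalued cells on , and | before counting, giving the total number of blobs. The fix points both at the internal export instead of the public one and corrects the /tmp filename. A minimal illustration of the same pipeline, assuming a made-up two-column sample file (the real scripts cut the tenant-specific blob_ss column, -f43 here):

# hypothetical sample: tab-separated, 2nd column is blob_ss
printf 'id\tblob_ss\n1\ta|b\n2\t\n3\tc\n' > sample.csv
# records with at least one blob ("types") -> 2
cut -f2 sample.csv | grep -v 'blob_ss' | perl -pe 's/\r//' | grep . | wc -l
# individual blobs ("tokens"): split cells on , and | first -> 3
cut -f2 sample.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l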

datasources/bampfa/solrETL-public.sh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ time python evaluate.py 4solr.$TENANT.public.csv /dev/null > counts.public.csv &
 rm d?.csv m?.csv b?.csv media.csv metadata.csv &
 cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
 cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cp counts.public.blobs.csv /tmp/$TENANT.counts.public.blobs.csv
 cat counts.public.blobs.csv
 wait
 cp counts.public.csv /tmp/$TENANT.counts.public.csv
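
The renamed /tmp target fixes a collision: the old destination, /tmp/$TENANT.counts.public.csv, is the same path the record counts are copied to on the last line of this hunk, so whichever cp ran second silently clobbered the other metric.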

datasources/pahma/solrETL-locations.sh

Lines changed: 17 additions & 15 deletions
@@ -21,11 +21,13 @@ export NUMCOLS=36
 ##############################################################################
 # extract locations, past and present, from CSpace
 ##############################################################################
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv &
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv &
+wait
 # cleanup newlines and crlf in data, then switch record separator.
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv &
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv &
+wait
 rm m1.csv m2.csv
 ##############################################################################
 # stitch the two files together
@@ -36,15 +38,19 @@ wait
 rm m1a.csv m2a.csv
 time join -j 1 -t $'\t' m1a.sort.csv m2a.sort.csv > m3.sort.csv
 rm m1a.sort.csv m2a.sort.csv
-cut -f1-5,10-14 m3.sort.csv > m4.csv
+cut -f1-5,7- m3.sort.csv > m4.csv
 ##############################################################################
 # we want to recover and use our "special" solr-friendly header, which got buried
 ##############################################################################
-grep -P "^csid_s\t" m4.csv > header4Solr.csv
-grep -v -P "^csid_s\t" m4.csv > m5.csv
+grep -P "^csid_s\t" m4.csv > header4Solr.csv &
+grep -v -P "^csid_s\t" m4.csv > m5.csv &
+wait
 cat header4Solr.csv m5.csv > m4.csv
 rm m5.csv m3.sort.csv
-time perl -ne " \$x = \$_ ;s/[^\t]//g; if (length eq 8) { print \$x;}" m4.csv > 4solr.${TENANT}.locations.csv
+##############################################################################
+# count the types and tokens in the final file
+##############################################################################
+time python evaluate.py m4.csv 4solr.${TENANT}.locations.csv > counts.locations.csv
 ##############################################################################
 # ok, now let's load this into solr...
 # clear out the existing data
@@ -56,14 +62,10 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-locations/update" --data '<comm
 # note, among other things, the overriding of the encapsulator with \
 ##############################################################################
 time curl -X POST -s -S 'http://localhost:8983/solr/pahma-locations/update/csv?commit=true&header=true&trim=true&separator=%09&encapsulator=\' -T 4solr.pahma.locations.csv -H 'Content-type:text/plain; charset=utf-8' &
-##############################################################################
-# count the types and tokens in the final file
-##############################################################################
-time python evaluate.py 4solr.$TENANT.locations.csv /dev/null > counts.locations.csv &
 # count blobs
-cut -f67 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
-cut -f67 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cut -f67 4solr.${TENANT}.locations.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.locations.blobs.csv
+cut -f67 4solr.${TENANT}.locations.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.locations.blobs.csv
+cp counts.locations.blobs.csv /tmp/$TENANT.counts.locations.csv
 # get rid of intermediate files
 rm m4.csv
 wait
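
Most of the churn in this file is one pattern applied three times: start two independent steps in the background with &, then wait before consuming their outputs. A minimal sketch of the pattern, with hypothetical stand-ins for the paired psql, perl, and grep steps:

# produce_m1/produce_m2 are placeholders, not functions from the commit
produce_m1() { sleep 1; echo a > m1.csv; }
produce_m2() { sleep 1; echo b > m2.csv; }
produce_m1 &        # run both concurrently
produce_m2 &
wait                # block until both background jobs have finished
cat m1.csv m2.csv   # only now are the outputs safe to read

Beyond the parallelization, the hunks fix the cut field list (-f1-5,7- for -f1-5,10-14), move the counting step ahead of the Solr load (evaluate.py now appears to write 4solr.${TENANT}.locations.csv from m4.csv, replacing the perl tab-count filter), and point the blob counts at the locations export instead of the public one.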

datasources/pahma/solrETL-osteology.sh

Lines changed: 2 additions & 2 deletions
@@ -54,8 +54,8 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-osteology/update" --data '<comm
 time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-osteology/update/csv?commit=true&header=true&separator=%09&f.taxon_ss.split=true&f.taxon_ss.separator=%7C&f.objculturedepicted_ss.split=true&f.objculturedepicted_sss.separator=%7C&f.objplacedepicted_ss.split=true&f.objplacedepicted_ss.separator=%7C&f.objpersondepicted_ss.split=true&f.objpersondepicted_ss.separator=%7C&f.status_ss.split=true&f.status_ss.separator=%7C&f.audio_md5_ss.split=true&f.audio_md5_ss.separator=%7C&f.blob_md5_ss.split=true&f.blob_md5_ss.separator=%7C&f.card_md5_ss.split=true&f.card_md5_ss.separator=%7C&f.x3d_md5_ss.split=true&f.x3d_md5_ss.separator=%7C&f.x3d_csid_ss.split=true&f.x3d_csid_ss.separator=%7C&f.video_md5_ss.split=true&f.video_md5_ss.separator=%7C&f.aggregate_ss.split=true&f.aggregate_ss.separator=%2C&f.objpp_ss.split=true&f.objpp_ss.separator=%7C&f.anonymousdonor_ss.split=true&f.anonymousdonor_ss.separator=%7C&f.objaltnum_ss.split=true&f.objaltnum_ss.separator=%7C&f.objfilecode_ss.split=true&f.objfilecode_ss.separator=%7C&f.objdimensions_ss.split=true&f.objdimensions_ss.separator=%7C&f.objmaterials_ss.split=true&f.objmaterials_ss.separator=%7C&f.objinscrtext_ss.split=true&f.objinscrtext_ss.separator=%7C&f.objcollector_ss.split=true&f.objcollector_ss.separator=%7C&f.objaccno_ss.split=true&f.objaccno_ss.separator=%7C&f.objaccdate_ss.split=true&f.objaccdate_ss.separator=%7C&f.objacqdate_ss.split=true&f.objacqdate_ss.separator=%7C&f.objassoccult_ss.split=true&f.objassoccult_ss.separator=%7C&f.objculturetree_ss.split=true&f.objculturetree_ss.separator=%7C&f.objfcptree_ss.split=true&f.objfcptree_ss.separator=%7C&f.grouptitle_ss.split=true&f.grouptitle_ss.separator=%7C&f.objmaker_ss.split=true&f.objmaker_ss.separator=%7C&f.objaccdate_begin_dts.split=true&f.objaccdate_begin_dts.separator=%7C&f.objacqdate_begin_dts.split=true&f.objacqdate_begin_dts.separator=%7C&f.objaccdate_end_dts.split=true&f.objaccdate_end_dts.separator=%7C&f.objacqdate_end_dts.split=true&f.objacqdate_end_dts.separator=%7C&f.blob_ss.split=true&f.blob_ss.separator=%7C&f.card_ss.split=true&f.card_ss.separator=%7C&f.imagetype_ss.split=true&f.imagetype_ss.separator=%7C&encapsulator=\\" -T 4solr.${TENANT}.osteology.csv -H 'Content-type:text/plain; charset=utf-8' &
 rm o?.csv header4Solr.csv
 # count blobs
-cut -f78 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
-cut -f78 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
+cut -f78 4solr.${TENANT}.osteology.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
+cut -f78 4solr.${TENANT}.osteology.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
 wait
 cp counts.osteology.blobs.csv /tmp/$TENANT.counts.osteology.blobs.csv
 cat counts.osteology.blobs.csv
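
The root cause was the same in all four scripts: the blob-count pipelines read 4solr.${TENANT}.public.csv even in the internal, locations, and osteology ETLs, so three of the four metrics ran against the wrong export. Because the blob_ss column index is hardcoded per export (-f43, -f67, -f78), a quick sanity check along these lines (illustrative only, not part of the commit) confirms the column position:

# print the 1-based position of blob_ss in a tab-separated header row
head -1 4solr.${TENANT}.osteology.csv | tr '\t' '\n' | grep -n '^blob_ss$'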
