@@ -21,11 +21,13 @@ export NUMCOLS=36
21
21
# #############################################################################
22
22
# extract locations, past and present, from CSpace
23
23
# #############################################################################
24
- time psql -F $' \t ' -R" @@" -A -U $USERNAME -d " $CONNECTSTRING " -f locations1.sql -o m1.csv
25
- time psql -F $' \t ' -R" @@" -A -U $USERNAME -d " $CONNECTSTRING " -f locations2.sql -o m2.csv
24
+ time psql -F $' \t ' -R" @@" -A -U $USERNAME -d " $CONNECTSTRING " -f locations1.sql -o m1.csv &
25
+ time psql -F $' \t ' -R" @@" -A -U $USERNAME -d " $CONNECTSTRING " -f locations2.sql -o m2.csv &
26
+ wait
26
27
# cleanup newlines and crlf in data, then switch record separator.
27
- time perl -pe ' s/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv
28
- time perl -pe ' s/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv
28
+ time perl -pe ' s/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv &
29
+ time perl -pe ' s/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv &
30
+ wait
29
31
rm m1.csv m2.csv
30
32
# #############################################################################
31
33
# stitch the two files together
36
38
rm m1a.csv m2a.csv
37
39
time join -j 1 -t $' \t ' m1a.sort.csv m2a.sort.csv > m3.sort.csv
38
40
rm m1a.sort.csv m2a.sort.csv
39
- cut -f1-5,10-14 m3.sort.csv > m4.csv
41
+ cut -f1-5,7- m3.sort.csv > m4.csv
40
42
# #############################################################################
41
43
# we want to recover and use our "special" solr-friendly header, which got buried
42
44
# #############################################################################
43
- grep -P " ^csid_s\t" m4.csv > header4Solr.csv
44
- grep -v -P " ^csid_s\t" m4.csv > m5.csv
45
+ grep -P " ^csid_s\t" m4.csv > header4Solr.csv &
46
+ grep -v -P " ^csid_s\t" m4.csv > m5.csv &
47
+ wait
45
48
cat header4Solr.csv m5.csv > m4.csv
46
49
rm m5.csv m3.sort.csv
47
- time perl -ne " \$ x = \$ _ ;s/[^\t]//g; if (length eq 8) { print \$ x;}" m4.csv > 4solr.${TENANT} .locations.csv
50
+ # #############################################################################
51
+ # count the types and tokens in the final file
52
+ # #############################################################################
53
+ time python evaluate.py m4.csv 4solr.${TENANT} .locations.csv > counts.locations.csv
48
54
# #############################################################################
49
55
# ok, now let's load this into solr...
50
56
# clear out the existing data
@@ -56,14 +62,10 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-locations/update" --data '<comm
56
62
# note, among other things, the overriding of the encapsulator with \
57
63
# #############################################################################
58
64
time curl -X POST -s -S ' http://localhost:8983/solr/pahma-locations/update/csv?commit=true&header=true&trim=true&separator=%09&encapsulator=\' -T 4solr.pahma.locations.csv -H ' Content-type:text/plain; charset=utf-8' &
59
- # #############################################################################
60
- # count the types and tokens in the final file
61
- # #############################################################################
62
- time python evaluate.py 4solr.$TENANT .locations.csv /dev/null > counts.locations.csv &
63
65
# count blobs
64
- cut -f67 4solr.${TENANT} .public .csv | grep -v ' blob_ss' | perl -pe ' s/\r//' | grep . | wc -l > counts.public .blobs.csv
65
- cut -f67 4solr.${TENANT} .public .csv | perl -pe ' s/\r//;s/,/\n/g' | grep -v ' blob_ss' | grep . | wc -l >> counts.public .blobs.csv
66
- cp counts.public .blobs.csv /tmp/$TENANT .counts.public .csv
66
+ cut -f67 4solr.${TENANT} .locations .csv | grep -v ' blob_ss' | perl -pe ' s/\r//' | grep . | wc -l > counts.locations .blobs.csv
67
+ cut -f67 4solr.${TENANT} .locations .csv | perl -pe ' s/\r//;s/,/\n/g' | grep -v ' blob_ss' | grep . | wc -l >> counts.locations .blobs.csv
68
+ cp counts.locations .blobs.csv /tmp/$TENANT .counts.locations .csv
67
69
# get rid of intermediate files
68
70
rm m4.csv
69
71
wait
0 commit comments