forked from mockturtl/fortune500
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfortune500.sh
executable file
·141 lines (117 loc) · 2.36 KB
/
fortune500.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
# Configuration shared by every step of the script.

# Number of command-line arguments; the option loop at the end of the
# script walks positions 1..ARGC.
ARGC=$#

# Opening cell tags that are grepped out of the pages: TAG1 lines go to the
# revenue file, TAG2 lines go to the profit file.
TAG1='<td class="cnncol3">'
TAG2='<td class="cnncol4">'

# Output data files.
REVENUE=revenue.txt
PROFIT=profit.txt

# File listing the pages to fetch, and the directory that holds them.
SOURCEFILES=sourcefiles
TARGET=html

# Fall back to 2012 when the caller did not supply a YEAR.
[ -n "$YEAR" ] || export YEAR="2012"
#verify input file exists
# Exits the script with status 1 when the list named by SOURCEFILES is not a
# regular file; otherwise returns silently.
# Globals: SOURCEFILES (read)
# Outputs: error message on stderr
function checkSources {
  # Quoted so that a path containing spaces is tested as one word; unquoted,
  # the test itself fails and a missing list would go unnoticed.
  if [ ! -f "${SOURCEFILES}" ]; then
    echo "ERROR: No sourcefiles list available" >&2
    exit 1
  fi
}
#verify files downloaded
# Exits the script with status 2 when the TARGET directory does not exist;
# otherwise returns silently.
# Globals: TARGET (read)
# Outputs: error message on stderr
function checkTargets {
  # Quoted so that a directory name containing spaces is tested as one word.
  if [ ! -d "$TARGET" ]; then
    echo "ERROR: Target directory '${TARGET}/' does not exist; run with '-d' to download" >&2
    exit 2
  fi
}
#delete files
# Removes the TARGET directory and everything below it, if it exists.
# Globals: TARGET (read)
# Outputs: progress message on stdout
function clean {
  echo "Cleaning old files..."
  # Quoted (and protected with --) so that a name with spaces or a leading
  # dash is handled as a single path.
  if [ -d "$TARGET" ]; then
    rm -r -- "$TARGET"
  fi
}
#fetch webpages
# Re-creates the TARGET directory and downloads every entry of SOURCEFILES
# into it.
# Globals: SOURCEFILES, TARGET, YEAR (read)
# Outputs: progress messages on stdout, plus whatever wget reports
function download {
  checkSources
  clean
  # Base URL handed to wget via -B as the reference for the list entries.
  local WROOT="http://money.cnn.com/magazines/fortune/fortune500/${YEAR}/full_list/"
  mkdir -p -- "$TARGET"
  echo "Downloading data for ${YEAR}..."
  # -P saves straight into TARGET. Downloading into the working directory and
  # then moving *.html would also sweep up unrelated .html files that happen
  # to live there, and would fail when nothing was fetched.
  wget -nv -i "${SOURCEFILES}" -B "$WROOT" -P "$TARGET"
}
#remove ignored whitespace
# Runs ./parse.py on the downloaded pages after checking that both the
# TARGET directory and the SOURCEFILES list exist.
# NOTE(review): what parse.py does to the pages is not visible from this
# script; the heading above suggests it strips ignorable whitespace.
# Globals: SOURCEFILES, TARGET (read)
# Outputs: progress message on stdout, plus whatever parse.py prints
function parse {
  checkTargets
  checkSources
  echo "Parsing html..."
  # Quoted so each path reaches parse.py as exactly one argument.
  ./parse.py "$SOURCEFILES" "$TARGET"
}
# Writes every line containing the literal string $1 from the pages listed
# in SOURCEFILES (looked up under TARGET) into file $2, replacing its
# previous contents.
# Arguments: $1 - text to search for, $2 - output file
# Globals:   SOURCEFILES, TARGET (read)
function _grepPages {
  local tag=$1 outfile=$2 page
  # Truncate rather than rm: no error when the file does not exist yet, and
  # the file is present afterwards even if nothing matched.
  : > "$outfile"
  # '|| [ -n "$page" ]' keeps a final entry that lacks a trailing newline.
  while read -r page || [ -n "$page" ]; do
    # A blank entry would make grep run against the directory itself.
    [ -n "$page" ] || continue
    grep -F -- "$tag" "$TARGET/$page" >> "$outfile"
  done < "$SOURCEFILES"
}

#copy relevant HTML tags to data files, one per line
# Collects the TAG1 lines of all pages into REVENUE and the TAG2 lines into
# PROFIT.
# Globals: TAG1, TAG2, REVENUE, PROFIT, SOURCEFILES, TARGET (read)
# Outputs: progress messages on stdout
function extract {
  checkSources
  echo "Exporting revenues to file..."
  _grepPages "${TAG1}" "$REVENUE"
  echo "Exporting profits to file..."
  _grepPages "${TAG2}" "$PROFIT"
}
#remove extraneous text characters from floating-point numbers
# Rewrites REVENUE and PROFIT in place so that each remaining line holds
# only the cell text with commas and whitespace removed.
# Globals: TAG1, TAG2, REVENUE, PROFIT (read)
# Outputs: progress message on stdout
function format {
  echo "Formatting numerical data..."
  # Expressions run in order on every line:
  #   1. drop every thousands separator (g flag: values such as 1,234,567.8
  #      contain more than one comma)
  #   2. drop the opening cell tag
  #   3. drop the closing </td>
  #   4. (profit only) turn the literal text N.A. into 0; the dots are
  #      escaped so they no longer match arbitrary characters
  #   5. drop all whitespace, leading and trailing alike
  #   6. delete lines that ended up empty
  sed -i \
    -e 's/,//g' \
    -e "s/${TAG1}//" \
    -e 's/<\/td>//' \
    -e 's/[[:space:]]//g' \
    -e '/^$/d' \
    "$REVENUE"
  sed -i \
    -e 's/,//g' \
    -e "s/${TAG2}//" \
    -e 's/<\/td>//' \
    -e 's/N\.A\./0/' \
    -e 's/[[:space:]]//g' \
    -e '/^$/d' \
    "$PROFIT"
}
#add the numerical data
# Prints the line counts of REVENUE and PROFIT and then runs ./sum.py.
# NOTE(review): sum.py takes no arguments here; which files it reads is not
# visible from this script.
# Globals: REVENUE, PROFIT (read)
# Outputs: status text and line counts on stdout, plus whatever sum.py prints
function sumAll {
  echo "Calculating sums with downloaded data."
  echo "Verifying line count in files:"
  # Quoted so a file name containing spaces is counted as one file.
  wc -l "${REVENUE}"
  wc -l "${PROFIT}"
  ./sum.py
}
# Walk the command-line arguments in order:
#   -d  download the pages, then carry on with the remaining arguments
#   -c  delete the downloaded pages and stop
# Any other argument is ignored.
for (( i = 1; i <= ARGC; i++ )); do
  #echo "Argv[$i] = ${!i}"
  if [[ ${!i} == '-d' ]]; then
    download
  elif [[ ${!i} == '-c' ]]; then
    clean
    echo 'Finished.'
    exit 0
  fi
done

# Processing pipeline, run on every invocation that did not exit above.
parse
extract
format
sumAll