app.coffee
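# Example invocation (an assumption, not from the source: it presumes the
# `coffee` CLI is available and that the URL list is the first positional
# argument, one URL per line):
#
#   coffee app.coffee urls.txt --open --concurrency 10 --depth 2
#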
exec = require('child_process').exec
fs = require('fs')
LinkDispatcher = require('./link_dispatcher')
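# Assumed LinkDispatcher interface (inferred from its use in this file): the
# constructor takes the three bookkeeping hashes, and get(url, depth, callback)
# fetches a page/feed and calls back with an object of extracted properties.
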
opts = require('nomnom').options({
  open: {
    abbr: 'o',
    flag: true,
    default: false,
    help: 'Opens the CSV when done.'
  },
  'no-images': {
    abbr: 'i',
    flag: true,
    help: 'Skips fetching images from feeds.'
  },
  concurrency: {
    abbr: 'c',
    default: 5,
    help: 'Set the number of pages/feeds we load simultaneously.'
  },
  depth: {
    abbr: 'd',
    default: 3,
    help: 'Set how deep of a chain (maximum) we follow before giving up.'
  }
}).nom()
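
# Default to fetching images unless --no-images was passed on the command line.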
opts.images = !opts['no-images'] if !opts.images?
# Simultaneous request limit for pages and feeds.
# At most one image request will occur per feed at once.
SIMULTANEOUS_PAGE_REQUEST_LIMIT = opts.concurrency
# Limit the number of pages we'll pass through before stopping (on each path).
process.MAX_DEPTH = opts.depth
filename = process.argv[2]
csv = null
console.log "---------------------------------------------------------------"
console.log "Starting crawler with URLs from #{filename}..."
console.log "Max depth: #{process.MAX_DEPTH}, concurrency: #{SIMULTANEOUS_PAGE_REQUEST_LIMIT}, fetching images: #{opts.images}."
console.log "---------------------------------------------------------------"

if filename?
  fs.readFile(filename, 'utf8', (err, data) ->
    csv = null
    urls = data.split('\n')
    @urlsToProcess = {}
    @urlsInProgress = {}
    @urlResults = {}
    for url in urls
      @urlsToProcess[url] = { depth: 0 }
    @dispatcher = new LinkDispatcher(@urlsToProcess, @urlsInProgress, @urlResults)
    for i in [1..SIMULTANEOUS_PAGE_REQUEST_LIMIT]
      processNextURL()
  )
else
  console.log "ERROR: please provide a file to read URLs from."
  process.exit(2)
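
# Take the next pending URL, mark it in progress, and hand it to the dispatcher.
# Each completed fetch schedules another, so up to SIMULTANEOUS_PAGE_REQUEST_LIMIT
# requests stay in flight; once both queues are empty, the results are written out.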
processNextURL = ->
  urls = Object.keys(@urlsToProcess)
  if urls.length > 0
    url = urls[0]
    @urlsInProgress[url] = { depth: @urlsToProcess[url].depth }
    delete @urlsToProcess[url]
    @dispatcher.get(url, @urlsInProgress[url].depth, (properties) =>
      @urlResults[url] = properties
      @urlResults[url].depth = @urlsInProgress[url].depth
      delete @urlsInProgress[url]
      processNextURL()
      if Object.keys(@urlsToProcess).length is 0 and Object.keys(@urlsInProgress).length is 0 and not process.saving?
        saveAsCSV()
    )
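
# Build the CSV in memory: the header row comes from the keys of the first
# result that has more than just a depth entry; commas and newlines inside
# string values are replaced with spaces so they don't break the CSV layout.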
saveAsCSV = ->
  process.saving = true
  for properties in Object.keys(@urlResults)
    if Object.keys(@urlResults[properties]).length > 1
      if !csv?
        csv = Object.keys(@urlResults[properties]).join(',') + '\n'
      csv += Object.keys(@urlResults[properties]).map((key) ->
        if typeof @urlResults[properties][key] is "string"
          @urlResults[properties][key].replace(/[,\n]/g, " ")
        else
          @urlResults[properties][key]
      ).join(',') + '\n'
  file = "./results/rss_scrape_results_#{new Date().getTime()}.csv"
  fs.writeFile(file, csv, (err) ->
    if err?
      # Write failed, but we've done all our processing so we'll output the CSV data to STDOUT
      # (so we don't have to start over)
      console.log err
      console.log "Done, but couldn't save CSV. Here's the data we were trying to save:"
      console.log csv
      process.exit(1)
    else
      console.log "All done. The results were saved into #{file}."
      if opts.open
        console.log "Opening #{file}..."
        exec("open #{file}")
      process.exit(0)
  )