-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlink_dispatcher.coffee
82 lines (69 loc) · 2.52 KB
/
link_dispatcher.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
request = require('request')
cheerio = require('cheerio')
URL = require('url')
FeedAnalyzer = require("./feed_analyzer")
PageAnalyzer = require("./page_analyzer")
# Fetches a URL and routes the response to FeedAnalyzer (when the document
# looks like an RSS/Atom feed) or to PageAnalyzer (everything else).
module.exports = class LinkDispatcher
  # urlsToProcess, urlsInProgress, urlResults are stored untouched and handed
  # to every PageAnalyzer created by `get`.
  constructor: (@urlsToProcess, @urlsInProgress, @urlResults) ->

  # Request `url` and pass the response to the matching analyzer.
  #
  # url      - address to fetch; surrounding whitespace is trimmed.
  # depth    - crawl depth of this URL; compared against process.MAX_DEPTH.
  #            If process.MAX_DEPTH is unset the comparison is false and
  #            nothing is fetched.
  # callback - called with {} when the URL is skipped or the request fails;
  #            otherwise passed to the analyzer's `process` method.
  get: (url, depth, callback) ->
    url = String(url ? "").trim()
    unless url.length > 0 and depth <= process.MAX_DEPTH and not @blocked(url)
      return callback({})

    options =
      uri: url
      followRedirect: true
      # `request` expects a plain object here; an array of objects does not
      # set the header.
      headers:
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36'
      timeout: 30000

    console.log "GET #{url} [page or feed]"
    request options, (err, response, data) =>
      if err?
        console.log "ERROR: #{err}"
        return callback({})
      if @isXML(url, response, data)
        new FeedAnalyzer(url, depth, response, data).process(callback)
      else
        new PageAnalyzer(url, depth, response, data, @urlsToProcess, @urlsInProgress, @urlResults).process(callback)

  # Guess whether this document is an RSS/Atom feed or not.
  #
  # url  - the requested address (checked for a feed-like file extension).
  # res  - the HTTP response; only `res.headers` is read.
  # data - the response body; may be missing, a string or a Buffer.
  #
  # Returns yes when any heuristic matches, no otherwise.
  isXML: (url, res, data) ->
    # Node lower-cases incoming header names, and the value may carry
    # parameters such as "; charset=utf-8", so compare the bare media type.
    contentType = String(res?.headers?['content-type'] ? "").split(";")[0].trim().toLowerCase()
    feedTypes = ["text/xml", "application/rss+xml", "application/atom+xml"]
    return yes if contentType in feedTypes

    # Cheap URL suffix check before touching the body.
    return yes if /\.(xml|rss|atom)$/.test(url)

    # Guard against an absent body so a bad response cannot throw here.
    text = if data? then data.toString().toLowerCase() else ""
    return no if text.length is 0

    # An XML declaration near the start, unless the document is (X)HTML.
    declAt = text.indexOf("<?xml")
    if 0 <= declAt <= 10 and text.indexOf("doctype") is -1 and text.indexOf("xhtml") is -1
      return yes

    # Last resort: parse and look for a feed root element. The text is
    # already lower-cased, so tag names match regardless of original case.
    $ = cheerio.load(text, {xmlMode: true})
    $("rss").length > 0 or $("feed").length > 0

  # Prevent common false-positives.
  #
  # Returns true when the URL's hostname contains one of the blocked domains
  # or the URL contains one of the blocked keywords (case-insensitive).
  blocked: (url) ->
    blockedDomains = [
      "feedly.com",
      "feedreader.com",
      "icopyright.net",
      "add.my.yahoo.com",
      "fusion.google.com"
    ]
    blockedKeywords = [
      "comments",
      "forum"
    ]

    # A URL without a hostname yields "", which matches no domain.
    host = (URL.parse(url).hostname ? "").toLowerCase()
    loweredURL = url.toLowerCase()

    for domain in blockedDomains
      return true if host.indexOf(domain) isnt -1
    for keyword in blockedKeywords
      return true if loweredURL.indexOf(keyword) isnt -1
    false