-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpage_analyzer.coffee
89 lines (73 loc) · 3.05 KB
/
page_analyzer.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
cheerio = require('cheerio')
URL = require('url')
urlNormalizer = require('./url_normalizer')()
# Analyzes one fetched HTML page: discovers candidate RSS/Atom feed URLs and
# queues them (with depth + 1) into the shared urlsToProcess map for the crawler.
module.exports = class PageAnalyzer
  # url            - URL of the page being analyzed (string)
  # depth          - crawl depth of this page (integer)
  # response       - HTTP response object; only statusCode is read here
  # html           - raw HTML body of the page
  # urlsToProcess  - shared map: url -> { depth }, URLs awaiting processing
  # urlsInProgress - shared map of URLs currently being processed
  # urlResults     - shared map of URLs already processed
  constructor: (@url, @depth, @response, @html, @urlsToProcess, @urlsInProgress, @urlResults) ->

  # Scans the page for candidate feed URLs, queueing each via processURL,
  # then invokes callback({}). Error responses (status >= 400) queue nothing.
  process: (callback) ->
    console.log "--> Processing HTML: #{@url}"
    if @response.statusCode >= 400
      # Bad URL. Do not proceed (and don't waste time parsing the HTML).
      callback({})
    else
      # Parse the HTML only once we know the response is usable.
      $ = cheerio.load(@html)
      # Ignore this page if we've hit our max depth.
      # When depth == MAX_DEPTH, we only care about processing feeds.
      # NOTE(review): MAX_DEPTH is read off the Node `process` global —
      # presumably set by the crawler's entry point; confirm.
      if @depth < process.MAX_DEPTH
        @checkForLinkTag $
        @queueCommonFeedURLs()
        @queuePotentialFeedLinks $
      callback({})

  # Checks for the RSS <link> tag, like many web browsers do.
  # This is a VERY reliable signal that the link is a feed (may not be a good one, though).
  checkForLinkTag: ($) ->
    tags = $("link[rel='alternate'][type='application/rss+xml'], link[rel='alternate'][type='application/atom+xml']")
    for tag in tags
      # Guard against malformed <link> tags with no href; previously this
      # would queue the literal key "undefined".
      @processURL(tag.attribs.href) if tag.attribs.href?

  # Queues up feed URLs that are common with WordPress, etc.
  queueCommonFeedURLs: ->
    parsedURL = URL.parse @url
    paths = [
      "/rss",
      "/feed",
      "/blog"
    ]
    for path in paths
      @processURL("#{parsedURL.protocol}//#{parsedURL.host}#{path}")

  # Analyzes the HTML and queues any links that might be either RSS feeds themselves
  # or links to pages that may then link to RSS feeds.
  queuePotentialFeedLinks: ($) ->
    selectors = [
      # Catches URLs ending in .xml and .rss, and potentially many others.
      "a[href*='xml']",
      "a[href*='rss']",
      # Blog pages, etc are more likely to contain RSS feed links.
      "a:contains('Blog')",
      "a:contains('Feed')",
      "a:contains('RSS')",
      "a:contains('XML')"
      # Catches all FeedBurner URLs in addition to pages containing links to feeds.
      # I've seen FeedBurner use these domains:
      #   feeds.feedburner.com
      #   feeds2.feedburner.com
      #   feedproxy.google.com
      "a[href*='feed']"
    ].join(', ')
    # Insane limit: make sure we only queue up at most 75 links per page.
    # It'd be nice if this could be lower, but a lot of sites have a whole page
    # full of RSS feed links, so we don't want to exclude any of those.
    $(selectors).filter((i, el) -> return i < 75).each (index, element) =>
      # Guard against anchors with no href attribute.
      if element.attribs.href?
        @processURL(urlNormalizer.getNormalizedURL(@url, element.attribs.href))

  # Queues urlToProcess with depth + 1 unless it was already processed, is in
  # progress, or is already queued. The already-queued check was previously
  # missing, so a URL seen again at a deeper level would be re-queued and its
  # original (shallower) depth overwritten.
  processURL: (urlToProcess) ->
    console.log "urls to process = #{Object.keys(@urlsToProcess).length}, in progress = #{Object.keys(@urlsInProgress).length}"
    if @urlResults[urlToProcess]? or @urlsInProgress[urlToProcess]? or @urlsToProcess[urlToProcess]?
      console.log "Skipped (already processed or in queue): #{urlToProcess}"
    else
      console.log("Queued: #{urlToProcess} with depth #{@depth + 1}.")
      @urlsToProcess[urlToProcess] = { depth: @depth + 1 }