-
Notifications
You must be signed in to change notification settings - Fork 6
/
main.js
56 lines (44 loc) · 1.34 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
const express = require("express");
const puppeteer = require("puppeteer");
// Port the HTTP server binds to when the app starts listening.
const port = 3000;
// Express application instance that the routes below are registered on.
const app = express();
/**
 * Reports whether `value` is a string containing an absolute http(s) URL.
 *
 * @param {unknown} value - Candidate taken from the request query.
 * @returns {boolean} True only for strings that parse as http: or https: URLs.
 */
function isHttpUrl(value) {
  if (typeof value !== "string") {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === "http:" || protocol === "https:";
  } catch (e) {
    // `new URL` throws on anything that is not an absolute URL.
    return false;
  }
}

/**
 * GET /crawl?website=<url>
 *
 * Starting from `website`, visits every linked page whose href starts with
 * `website` and records the page text. Pages are visited one at a time,
 * most recently discovered first.
 *
 * Responses:
 *   - 200 with an object mapping each visited URL to the `innerText` of the
 *     first element matching `*` on that page.
 *   - 400 (forwarded to the Express error handler via `next`) when `website`
 *     is missing or is not an absolute http(s) URL.
 *   - 500 "Something broke" when launching the browser or crawling fails.
 */
app.get("/crawl", async (req, res, next) => {
  const website = req.query.website;
  if (!website) {
    const err = new Error("Required query website missing");
    err.status = 400;
    // Return here: without it the crawl below would still run with an
    // undefined URL and try to send a second response.
    return next(err);
  }
  // `website` is untrusted input handed to a real browser; refuse non-string
  // values (e.g. repeated query params) and schemes such as file://.
  if (!isHttpUrl(website)) {
    const err = new Error("Query website must be an absolute http(s) URL");
    err.status = 400;
    return next(err);
  }

  try {
    const browser = await puppeteer.launch();
    const registry = {};
    try {
      let queue = [website];
      while (queue.length > 0) {
        const url = queue.pop();
        console.log("current url", url);
        const page = await browser.newPage();
        try {
          await page.goto(url);
          registry[url] = await page.$eval("*", (el) => el.innerText);
          console.log("queue length", queue.length);
          const hrefs = await page.$$eval("a", (anchorEls) =>
            anchorEls.map((a) => a.href)
          );
          // Stay within the requested site and skip pages already recorded.
          const unseenHrefs = hrefs.filter(
            (href) => href.startsWith(website) && registry[href] === undefined
          );
          // The Set drops duplicates while keeping first-seen order, so
          // links already waiting in the queue are not scheduled twice.
          queue = [...new Set([...queue, ...unseenHrefs])];
        } finally {
          await page.close();
        }
      }
    } finally {
      // Always shut the browser down, even when a page fails, so a failed
      // request does not leave a browser process behind.
      await browser.close();
    }
    return res.status(200).send(registry);
  } catch (e) {
    console.log(e);
    res.status(500).send("Something broke");
  }
});
// Start the HTTP server on `port` and log a message from the listen callback.
app.listen(port, function onListening() {
  console.log(`app is running on port: ${port}`);
});