index.js (forked from gaelgthomas/first-puppeteer-scraper-example; 76 lines, 64 loc, 2.35 KB)
const puppeteer = require('puppeteer');
const fs = require('fs');
function extractItems() {
  /* For extractedElements, select the tag and class that hold the
     desired information, then choose the child element you would
     like to scrape from each match. */
  const extractedElements = document.querySelectorAll('.quote');
  // Convert the NodeList of quotes to an iterable array,
  // then fetch the text and author for each quote
  return Array.from(extractedElements).map((quote) => {
    // Get the sub-elements from the previously fetched quote element
    const text = quote.querySelector('.text').innerText;
    const author = quote.querySelector('.author').innerText;
    return { text, author };
  });
}
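
// For illustration, each item returned by extractItems has this shape
// (actual values depend on the live page):
//   { text: '"...a quote..."', author: 'Author Name' }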
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 800,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      // Wait until the page grows, i.e. new items have loaded;
      // this throws on timeout once no more content arrives.
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      // Pause between scrolls so new items can render. A plain setTimeout
      // promise is used because page.waitForTimeout was removed in recent
      // Puppeteer versions.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (e) {
    // Reaching the end of the infinite scroll (a waitForFunction timeout)
    // lands here; return whatever has been collected so far.
  }
  return items;
}
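
// A minimal alternative sketch, assuming the same .quote/.text/.author
// markup: page.$$eval can perform the extraction in a single call
// instead of passing extractItems through page.evaluate:
//
//   const items = await page.$$eval('.quote', (quotes) =>
//     quotes.map((q) => ({
//       text: q.querySelector('.text').innerText,
//       author: q.querySelector('.author').innerText,
//     }))
//   );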
const getQuotes = async () => {
  // Start a Puppeteer session with:
  // - a visible browser (`headless: false` - easier to debug because you'll see the browser in action)
  // - no default viewport (`defaultViewport: null` - the page will render at full width and height)
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
  });
  // Open a new page
  const page = await browser.newPage();
  // On this new page:
  // - open the "http://quotes.toscrape.com/scroll" website
  // - wait until the DOM content is loaded (HTML is ready)
  await page.goto('http://quotes.toscrape.com/scroll', {
    waitUntil: 'domcontentloaded',
  });
  // Auto-scroll and extract the desired items from the page
  // (set here to collect 100 items).
  const items = await scrapeItems(page, extractItems, 100);
  // Save the extracted quotes to a JSON file.
  fs.writeFile('quotes.json', JSON.stringify(items), (err) => {
    if (err) throw err;
    console.log('File saved');
  });
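  // Note: JSON.stringify(items, null, 2) would pretty-print the file
  // with two-space indentation if human-readable output is preferred.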
  // Display the quotes
  console.log(items);
  // Close the browser
  await browser.close();
};

// Start the scraping
getQuotes();
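
// To run this scraper (assuming Node.js and the puppeteer package are
// installed, e.g. via `npm install puppeteer`):
//
//   node index.js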