-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.js
181 lines (152 loc) · 5.1 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
'use strict';
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const url = require('url');
const logger = require('./logger');
/**
 * Scrapes Google Images search results using a Puppeteer-driven browser.
 *
 * Each result is reported as `{ url, source, description }`, extracted from the
 * `/imgres` anchors found inside the `#islrg` results container.
 */
class GoogleScraper {
  /**
   * @param {object} [options]
   * @param {string} [options.userAgent] user agent string applied to the page
   * @param {number} [options.scrollDelay=500] milliseconds to wait after scrolling
   * @param {object} [options.puppeteer={}] options forwarded to `puppeteer.launch`
   * @param {object} [options.tbs={}] extra options for the TBS request parameter
   */
  constructor({
    userAgent = 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0',
    scrollDelay = 500,
    puppeteer = {},
    tbs = {},
  } = {}) {
    this.userAgent = userAgent;
    this.scrollDelay = scrollDelay;
    this.puppeteerOptions = puppeteer;
    this.tbs = this._parseRequestParameters(tbs);
  }

  /**
   * Serialise the TBS options into a URL-encoded `key:value,key:value` string.
   * Entries with a falsy value are skipped.
   *
   * @param {object|null|undefined} tbs TBS options
   * @returns {string} encoded parameter value, or '' when no options are given
   */
  _parseRequestParameters(tbs) {
    // `== null` matches both undefined and null; the destructuring default in the
    // constructor only covers undefined, and Object.keys(null) would throw.
    if (tbs == null) {
      return '';
    }
    const options = Object.keys(tbs)
      .filter((key) => tbs[key])
      .map((key) => `${key}:${tbs[key]}`)
      .join(',');
    return encodeURIComponent(options);
  }

  /**
   * Run an image search and collect up to `limit` results.
   *
   * @param {string} searchQuery plain-text search terms (URL-encoded here)
   * @param {number} [limit=100] maximum number of results to return
   * @param {string} [domain='com'] Google top-level domain to query
   * @returns {Promise<Array<{url: string, source: string, description: string}>>}
   * @throws {Error} when `searchQuery` is missing or empty
   */
  async scrape(searchQuery, limit = 100, domain = 'com') {
    if (searchQuery == null || searchQuery === '') {
      throw new Error('Invalid search query provided');
    }

    // Encode the query so characters such as `&`, `#` or `+` cannot break the URL.
    const encodedQuery = encodeURIComponent(searchQuery);
    const query = `https://www.google.${domain}/search?q=${encodedQuery}&source=lnms&tbm=isch&sa=X&tbs=${this.tbs}`;

    const browser = await puppeteer.launch({
      ...this.puppeteerOptions,
    });

    // The finally block guarantees the browser is closed even when a step throws.
    try {
      const page = await browser.newPage();
      await page.setBypassCSP(true);
      await page.goto(query, {
        waitUntil: 'networkidle0',
      });
      // NOTE(review): viewport and user agent are set after the initial navigation,
      // so they presumably only affect later requests. Order kept as-is - confirm
      // before moving them ahead of `goto`.
      await page.setViewport({ width: 1920, height: 1080 });
      await page.setUserAgent(this.userAgent);

      let results = [];
      while (results.length < limit) {
        await this._scrollToEnd(page);
        await this._clickAllImages(page);
        try {
          // Wait for the selector to appear in page.
          await page.waitForSelector("#islrg a[href^='/imgres']", { timeout: 1000 });
        } catch (err) {
          // Best effort: a timeout must not abort the scrape.
          logger.debug('No results on this page');
        }

        const html = await page.content();
        const previousCount = results.length;
        results = this._parseLinksFromHTML(html).slice(0, limit);

        // No growth since the previous pass means there is nothing left to load.
        if (previousCount === results.length) {
          logger.debug('End of the page is reached');
          break;
        }
        logger.debug(`Got ${results.length} results so far`);
      }
      return results;
    } finally {
      await browser.close();
    }
  }

  /**
   * Scroll to the end of the page, clicking the "show more results" button when
   * it is visible. Returns early when that button is not present in the DOM.
   *
   * @param {object} page Puppeteer page to scroll
   * @returns {Promise<void>}
   */
  async _scrollToEnd(page) {
    logger.debug('Scrolling to the end of the page');
    const isScrollable = await this._isScrollable(page);
    if (!isScrollable) {
      logger.debug('No results on this page');
      return;
    }

    const buttonIsVisible = await this._isButtonVisible(page);
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    logger.debug(`Scrolled to bottom of the page`);

    if (buttonIsVisible) {
      await page.click("#islmp input[type='button']");
      logger.debug('Clicked on show more results');
    }

    // Plain timer instead of `page.waitFor`, which is deprecated in Puppeteer and
    // not available in recent releases.
    await new Promise((resolve) => setTimeout(resolve, this.scrollDelay));
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
  }

  /**
   * Check whether the "show more results" button exists in the DOM.
   *
   * @param {object} page Puppeteer page
   * @returns {Promise<boolean>}
   */
  _isScrollable(page) {
    return page.evaluate(() => {
      return document.querySelector("#islmp input[type='button']") !== null;
    });
  }

  /**
   * Check whether the "show more results" button is currently rendered.
   *
   * @param {object} page Puppeteer page
   * @returns {Promise<boolean>} false when the button is missing or has no layout box
   */
  _isButtonVisible(page) {
    return page.evaluate(() => {
      const button = document.querySelector("#islmp input[type='button']");
      if (button === null) {
        return false;
      }
      return !!(button.offsetWidth || button.offsetHeight || button.getClientRects().length);
    });
  }

  /**
   * Read the text of the last child of the `data-endedmessage` element.
   *
   * @param {object} page Puppeteer page
   * @returns {Promise<string|null>} the text, or null when the element is absent
   */
  _getInfiniteScrollStatus(page) {
    return page.evaluate(() => {
      const status = document.querySelector('#islmp div[data-endedmessage] > div:last-child');
      return status === null ? null : status.innerText;
    });
  }

  /**
   * Dispatch a right-button `mousedown` event on every image in the results grid.
   * Presumably this makes the page fill in the `/imgres` links that
   * `_parseLinksFromHTML` reads - TODO confirm.
   *
   * @param {object} page Puppeteer page
   * @returns {Promise<void>}
   */
  async _clickAllImages(page) {
    logger.debug('Clicking all images');
    return page.evaluate(() => {
      const images = document.querySelectorAll('#islrg img');
      // dispatchEvent is synchronous, so a plain loop finishes every click before
      // evaluate() resolves.
      for (const image of images) {
        const { x, y } = image.getBoundingClientRect();
        image.dispatchEvent(
          new MouseEvent('mousedown', {
            bubbles: true,
            cancelable: false,
            view: window,
            button: 2,
            buttons: 2,
            clientX: x,
            clientY: y,
          })
        );
      }
    });
  }

  /**
   * Extract image results from the page markup.
   *
   * @param {string} html full HTML of the results page
   * @returns {Array<{url: string, source: string, description: string}>}
   */
  _parseLinksFromHTML(html) {
    const links = [];
    const $ = cheerio.load(html);
    $("#islrg a[href^='/imgres']").each((i, elem) => {
      const anchor = $(elem);
      const description = anchor.next().find('div > div:first-child').text();
      // The second argument of url.parse is a boolean flag that enables query parsing.
      const { query } = url.parse(anchor.attr('href'), true);
      links.push({ url: query.imgurl, source: query.imgrefurl, description });
    });
    return links;
  }
}
module.exports = GoogleScraper;