forked from russoedu/LegisCrawler.br
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspider.js
54 lines (45 loc) · 1.32 KB
/
spider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env node
const http = require('http');
const cron = require('node-cron');
const Elapsy = require('elapsy');
const Spider = require('./crawl/Spider');
const SpiderStatus = require('./helpers/SpiderStatus');
// Elapsy instance created at start-up.
// NOTE(review): `elapsy` is not referenced again in the visible part of this
// file - confirm whether it is still needed before removing it.
const elapsy = new Elapsy();
// Start URL handed to Spider.crawlLinks further down in this script.
const url = 'http://www4.planalto.gov.br/legislacao/portal-legis/legislacao-1';
// Alternative start URL (the `codigos-1` sub-path), currently disabled:
// const url = 'http://www4.planalto.gov.br/legislacao/portal-legis/legislacao-1/codigos-1';
/**
 * Number of parallel requests/executions used when `--parallel` is not given
 * @property DEFAULT_PARALLEL
 * @type {Number}
 * @default 1
 */
const DEFAULT_PARALLEL = 1;

/**
 * Read the supported CLI flags from an argv-style array.
 *
 * Recognised flags:
 *  - `--no-schedule`   run the Spider immediately instead of creating a cron job
 *  - `--parallel <n>`  number of parallel requests and executions (positive integer)
 *
 * @method parseCliArguments
 * @param {String[]} argv Argument list, e.g. `process.argv`
 * @return {Object} `{ useSchedule: Boolean, parallel: Number }`
 * @throws {RangeError} When `--parallel` is not followed by a positive integer
 */
function parseCliArguments(argv) {
  const options = { useSchedule: true, parallel: DEFAULT_PARALLEL };

  argv.forEach((arg, index) => {
    if (arg === '--no-schedule') {
      options.useSchedule = false;
    } else if (arg === '--parallel') {
      const rawValue = argv[index + 1];
      const value = Number(rawValue);
      // A missing or non-numeric value would become NaN (and '' would become 0);
      // either one ends up in http.globalAgent.maxSockets, so fail loudly here.
      if (!Number.isInteger(value) || value < 1) {
        throw new RangeError(`--parallel expects a positive integer, got "${rawValue}"`);
      }
      options.parallel = value;
    }
  });

  return options;
}

// Read the CLI arguments
const cliOptions = parseCliArguments(process.argv);

/**
 * Define if the Spider should create a cron job or run now
 * @property useSchedule
 * @type {Boolean}
 * @default true
 */
let useSchedule = cliOptions.useSchedule;

/**
 * Define how many parallel requests and executions should run
 * @property parallel
 * @type {Number}
 * @default 1
 */
global.parallel = cliOptions.parallel;
// Let the process register any number of listeners: 0 disables the
// max-listeners limit and its warning.
process.setMaxListeners(0);

// Requested libuv thread pool size, exported through the environment.
// NOTE(review): presumably this must be set before the thread pool is first
// used to have any effect - confirm for the target platform.
const threadPoolSize = 128;
process.env.UV_THREADPOOL_SIZE = threadPoolSize;

// Cap the sockets the shared HTTP agent may open at the configured
// parallelism level.
const sharedAgent = http.globalAgent;
sharedAgent.maxSockets = global.parallel;
/**
 * Kick off one crawl from the configured start URL.
 * Takes no arguments and returns nothing, so it can be passed directly as the
 * cron callback.
 */
const startCrawl = () => {
  Spider.crawlLinks(url);
};

if (!useSchedule) {
  // Scheduling disabled: crawl once, right now
  startCrawl();
} else {
  // Scheduling enabled: record the chosen hour and register the cron job
  const hour = 4;
  const cronExpression = `0 0 ${hour} 1-31 * *`;
  SpiderStatus.cronSet(hour);
  cron.schedule(cronExpression, startCrawl);
}