forked from SheetJS/enron_xls
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse.mjs
executable file
·76 lines (64 loc) · 2.4 KB
/
parse.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env node
import { readFileSync, existsSync, writeFileSync, statSync } from 'fs';
import fetch from "node-fetch";
const { DOMParser } = await import('@xmldom/xmldom');
/* identifier of the archive.org item; it appears in the manifest file name and in the item URLs */
const tag = "edrm.enron.email.data.set.v2.xml";
/* local path where the item's file manifest is stored */
const baseXML = `artifacts/${tag}_files.xml`;
/* host that every request in this script is sent to */
const baseURL = `https://ia801309.us.archive.org`;
/* file extension searched for in the archive listings; override with the EXT environment variable */
const EXT = process.env.EXT || "xls";
/*
NOTE(review): presumably the number of files per extension found in the data set -- confirm
| ext | count |
|:----|------:|
| wk4 | 1 |
| wk1 | 2 |
| tsv | 6 |
| xla | 30 |
| xlw | 124 |
| xls | 65564 |
*/
/**
 * Download a file unless it is already present on disk.
 *
 * A response with a non-2xx status is reported and NOT written: saving an
 * error page would make the existence check treat it as a finished download
 * on every later run, so the failure could never be retried.
 *
 * @param {string} fn - name of the file to write
 * @param {string} url - address to fetch
 * @param {string|number} [size] - expected size; only shown in the log line
 * @param {string} [subdir] - prefix prepended verbatim to `fn` (include the trailing slash)
 * @returns {Promise<void>} resolves once the file is written or the download is skipped
 */
async function grab(fn, url, size, subdir) {
  const target = subdir ? subdir + fn : fn;
  if (existsSync(target)) return;

  console.log(`GET ${url}${size ? " " + size : ""}`);
  const res = await fetch(url);
  if (!res.ok) {
    console.error(`GET ${url} failed: HTTP ${res.status} ${res.statusText}`);
    return;
  }
  writeFileSync(target, Buffer.from(await res.arrayBuffer()));
}
/* grab the archive.org manifest */
await grab(baseXML, `${baseURL}/4/items/${tag}/${tag}_files.xml`);
/*
 * find the file entries
 * The MIME type is passed explicitly: older xmldom releases treat a missing
 * type as XML, while newer ones reject the call, so naming it works on both.
 */
const doc = new DOMParser().parseFromString(readFileSync(baseXML).toString(), "text/xml");
const files = doc.getElementsByTagName("file");
/*
 * look for ZIP files and process them, one at a time
 * The loop is awaited at module level (instead of inside a detached async
 * function) so that a failure rejects the module itself.
 */
for (let i = 0; i < files.length; ++i) {
  const file = files[i];
  const name = file.getAttribute("name");
  if (!name) continue;
  /* an optional command-line argument restricts the run to a single entry */
  if (process.argv[2] && process.argv[2] !== name) continue;
  const format = file.getElementsByTagName("format")?.[0]?.textContent?.trim();
  if (format === "ZIP") await zipped(name);
}
/**
 * Process one ZIP entry of the manifest: fetch its HTML listing, then download
 * every listed file whose link text ends in `.${EXT}` into `files/`.
 * (this part depends on the specific layout of the HTML listing)
 *
 * @param {string} name - value of the entry's `name` attribute in the manifest
 * @returns {Promise<void>} resolves after every matching row has been handled
 */
async function zipped(name) {
  /* grab the HTML page (there is no XML entry like the main listing) */
  const listing = `artifacts/ZZZ_${name}`;
  await grab(listing, `${baseURL}/view_archive.php?archive=/4/items/${tag}/${name}`);

  /* find the relevant links: one listing row per line */
  const rows = readFileSync(listing)
    .toString()
    .split(/[\r\n]+/)
    .filter((line) => line.includes(`.${EXT}</a>`));

  for (const row of rows) {
    const link = row.match(/a href="(.*?)"/);
    /* a row without a usable link is skipped; it must not end the whole archive */
    if (!link) continue;
    const href = link[1];
    const expected = row.match(/td id="size">(\d+)<\//)?.[1];

    /* try to download the file, named after the last path segment of the link */
    const f = href.split("/").slice(-1)[0];
    await grab(f, (href.startsWith("//") ? "https:" : "") + href, expected, "files/");

    /* if the file size is found, compare it with what is on disk */
    if (!expected) continue;
    let actual;
    try {
      actual = statSync("files/" + f).size;
    } catch (err) {
      /* report instead of hiding the problem, but keep going with the other rows */
      console.error(`${f}: cannot check size (${err.message})`);
      continue;
    }
    if (actual !== Number(expected)) console.log(`${f}: expected size ${expected}, actual size ${actual}`);
  }
}