-
Notifications
You must be signed in to change notification settings - Fork 3
/
setup.js
133 lines (112 loc) · 3.83 KB
/
setup.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
const fs = require("fs");
const path = require("path");
const FlexSearch = require("flexsearch");
const DataHandler = require("./src/core/data-handler");
const utils = require("./src/utils");
const axios = require("axios");
const Zip = require("adm-zip");
// Remote zip archive that setupDatabase downloads when the dataset CSV is missing.
const DATA_URL = "https://zenodo.org/record/5094598/files/dataset.zip";
// Dataset CSV read by setupPackageDB; its absence is what triggers the download.
const DATA_PATH = "data/dataset.csv";
// Where the downloaded archive is saved; it is deleted again after extraction.
const DATA_ZIP_PATH = "data/dataset.zip";
// File that the exported FlexSearch index is written to by setupPackageDB.
const PACKAGE_DB_PATH = "data/packageDB.txt";
/**
 * Builds the package search index from the dataset CSV and saves it to disk.
 *
 * Every CSV row has its `keywords` and `snippets` columns JSON-decoded
 * (`keywords` falls back to an empty array when the column is empty). The
 * rows are then added to a FlexSearch document index over `description` and
 * `keywords`, keyed by the row's position, and the exported index is written
 * to PACKAGE_DB_PATH.
 *
 * @returns {Promise<void>} Resolves after the index file has been written.
 */
async function setupPackageDB() {
  const rows = [];

  // Decode the JSON-encoded columns in place while the CSV is streamed in.
  await utils.readCSVStream(DATA_PATH, (row) => {
    row.keywords = row.keywords ? JSON.parse(row.keywords) : [];
    row.snippets = JSON.parse(row.snippets);
    rows.push(row);
  });

  const searchIndex = new FlexSearch("memory", {
    // Whole-word matching only: splitting a word into f/fi/fil/file would
    // make an already unspecific search even less specific.
    tokenize: "strict",
    doc: {
      id: "id",
      field: ["description", "keywords"],
    },
    // Reduce text to its keywords (per DataHandler) before it is indexed.
    encode: (text) => DataHandler.keywords(text).join(" "),
  });

  // The document id is simply the row's position in the dataset.
  rows.forEach((row, position) => {
    searchIndex.add({
      id: position,
      description: row["description"],
      keywords: row["keywords"],
    });
  });

  fs.writeFileSync(PACKAGE_DB_PATH, searchIndex.export(), { encoding: "utf-8" });
  console.log("Done");
}
/**
 * Downloads `url` into the local file `path`, logging progress in 10% steps.
 *
 * The "data" directory is created first if it does not exist yet.
 *
 * @param {string} url - Address to fetch with an HTTP GET.
 * @param {string} path - Destination file path. Note that inside this
 *   function the parameter shadows the `path` module.
 * @returns {Promise<void>} Resolves once the file stream has finished
 *   writing. Rejects if the request fails, the response stream errors, or
 *   the file cannot be written.
 */
async function download(url, path) {
  if (!fs.existsSync("data")) fs.mkdirSync("data");

  // Awaiting the request (instead of an uncaught `.then` chain) makes a
  // failed request reject this function's promise rather than leaving it
  // pending forever with an unhandled rejection.
  const response = await axios.get(url, { responseType: "stream" });

  // Take the header from the axios response itself: `response.data` is only
  // guaranteed to be a readable stream and need not carry `headers`.
  // A missing header yields NaN, which simply disables progress output.
  const total = Number.parseInt(response.headers["content-length"], 10);

  return new Promise((resolve, reject) => {
    const fileStream = fs.createWriteStream(path);
    fileStream
      .on("error", (err) => {
        console.log(err.message);
        reject(err);
      })
      .on("finish", () => {
        resolve();
      });

    let current = 0;
    let nextPercent = 10;
    response.data
      .on("data", (chunk) => {
        current += chunk.length;
        const percent = Math.round((current / total) * 100);
        // A single large chunk can cross several thresholds, so loop; the
        // upper bound keeps a wrong content-length from logging past 100%.
        while (nextPercent <= 100 && percent >= nextPercent) {
          console.log(nextPercent + "%");
          nextPercent += 10;
        }
      })
      .on("error", (err) => {
        // pipe() does not forward source errors to the destination, so close
        // the file and fail explicitly instead of hanging.
        console.log(err.message);
        fileStream.destroy();
        reject(err);
      })
      // pipe() ends the file stream when the response ends, which in turn
      // fires "finish" above; no manual end signalling is required.
      .pipe(fileStream);
  });
}
/**
 * Makes sure the dataset CSV and the package search database both exist.
 *
 * If the CSV at DATA_PATH is missing, the archive is downloaded from
 * DATA_URL, extracted next to DATA_PATH and then deleted. The package
 * database is (re)built when a fresh dataset was just downloaded or when no
 * database file exists at PACKAGE_DB_PATH.
 *
 * @returns {Promise<void>} Resolves once every required step has completed,
 *   including the package database build; rejects if any step fails.
 */
async function setupDatabase() {
  const newDataset = !fs.existsSync(DATA_PATH);

  if (newDataset) {
    console.log("Dataset missing. Dataset will be downloaded from " + DATA_URL);
    await download(DATA_URL, DATA_ZIP_PATH);

    // Unpack into the directory that holds the dataset, overwriting old files.
    const targetDir = path.dirname(DATA_PATH);
    console.log("Extracting from " + DATA_ZIP_PATH + " to " + targetDir);
    const zip = new Zip(DATA_ZIP_PATH);
    zip.extractAllTo(targetDir, true);

    // The archive is no longer needed once its contents are extracted.
    console.log("Deleting " + DATA_ZIP_PATH);
    fs.unlinkSync(DATA_ZIP_PATH);
  }

  if (newDataset || !fs.existsSync(PACKAGE_DB_PATH)) {
    if (newDataset) {
      console.log("Updating package database.");
    } else {
      console.log("No package database exists, creating.");
    }
    // Await the build so this function only resolves when the database is
    // on disk, and so a build failure propagates to the caller instead of
    // becoming an unhandled rejection.
    await setupPackageDB();
  }
}
/**
 * Script entry point: runs the dataset / package database setup.
 *
 * @returns {Promise<void>} Settles together with setupDatabase().
 */
async function main() {
  // Await so that the returned promise reflects the setup's outcome.
  await setupDatabase();
}

// Report a failure and exit non-zero instead of leaving an unhandled
// promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});