-
Notifications
You must be signed in to change notification settings - Fork 0
/
experiment_init.js
112 lines (92 loc) · 3.48 KB
/
experiment_init.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/**
 * Initializes the experiment: creates a new SQLite database and loads the
 * initial list of pages and categories into it.
 *
 * Exit codes used below: 1 = wrong command line, 2 = a file precondition
 * failed (database already exists / input list is missing).
 */
/* process two arguments: <input_list.cats> <experiment.sqlite3> */
var path = require('path');
var args = process.argv.slice(2);
if (args.length !== 2) {
    console.error("\ncreates a new database and loads initial list of categories\n" +
        "usage: node " + path.basename(process.argv[1]) + " <input_list.cats> <experiment.sqlite3>\n" +
        " where\n" +
        " <input_list.cats> is a file with an initial list of pages and categories\n" +
        " <experiment.sqlite3> sqlite 3 database to be created\n");
    process.exit(1);
}
var dbfile = args[1],
    init_cats = args[0];
/* stop if database already exists, so an existing experiment is never overwritten */
var fs = require("fs");
if (fs.existsSync(dbfile)) {
    console.error("database file '" + dbfile + "' already exists, exiting..");
    process.exit(2);
}
/* stop before anything is created if the input list is missing; otherwise an
   empty database file would be left behind and block the next attempt */
if (!fs.existsSync(init_cats)) {
    console.error("input file '" + init_cats + "' does not exist, exiting..");
    process.exit(2);
}
/* create database tables */
var sqlite3 = require('sqlite3').verbose();
console.log("Creating database: " + dbfile);
/* OPEN_CREATE | OPEN_READWRITE: create the file and open it for writing;
   the callback only reports a failure to open and aborts the script */
var db = new sqlite3.Database(dbfile, sqlite3.OPEN_CREATE | sqlite3.OPEN_READWRITE, function(err) {
    if (err) {
        console.error('Failed: ' + err);
        process.exit(2);
    }
});
/* serialize() makes the two CREATE TABLE statements run one after another */
db.serialize(function() {
    /* one row per page or category */
    db.run("CREATE TABLE entries (" +
        "entry TEXT NOT NULL PRIMARY KEY, " +
        "pageid INT," +               /* pageid <= 1 means that page does not exist */
        "link_count INT DEFAULT 0, " + /* number of category links on the page or pages in the category */
        "mentions INT DEFAULT 0, " +   /* number of times the page or category was mentioned in page category lists */
        "dist INT DEFAULT 0," +        /* distance from the root of the graph (entries in the initial set are presumed to have 0 distance) */
        "rev_count INT, " +            /* number of page edits */
        "first_edit TEXT, " +          /* date of the first edit */
        "last_edit TEXT, " +           /* date of the last edit */
        "content TEXT, " +             /* the content of the page in Wiki format */
        "comment TEXT," +              /* explanation of what we do with the entry */
        "parsed BOOLEAN DEFAULT 0" +   /* flag if the page was parsed or pages in the category have been downloaded */
        ")");
    /* pairs an entry with a source entry; presumably the entry it was reached
       from (the initial load stores '' as the source) - TODO confirm */
    db.run("CREATE TABLE cat_src (" +
        "entry TEXT, " +
        "src_entry TEXT, " +
        "PRIMARY KEY (entry, src_entry), " +
        /* table constraints are comma-separated in standard SQL; the comma
           after the first FOREIGN KEY clause was missing */
        "FOREIGN KEY (entry) REFERENCES entries(entry), " +
        /* NOTE(review): src_entry references cat_src(entry), not entries(entry);
           this looks unintended but is kept as-is - confirm before changing */
        "FOREIGN KEY (src_entry) REFERENCES cat_src(entry)" +
        ")");
});
/* load cats file into the database */
/* stmt1 stores an entry together with its comment; stmt2 records the entry in
   cat_src with an empty source entry */
var stmt1 = db.prepare("INSERT INTO entries (entry, comment) VALUES (?, ?)"),
    stmt2 = db.prepare("INSERT INTO cat_src (entry, src_entry) VALUES (?, '')");
var lineReader = require('line-reader');
/* bookkeeping read by complete_loading(): entries queued for insertion,
   insert callbacks received so far, and whether the input file was read to
   the end */
var lines_read = 0;
var entries_inserted = 0;
var readline_complete = false;
/**
 * Finishes the load once the input file has been read to the end
 * (readline_complete) and every queued insert has reported back
 * (entries_inserted equals lines_read): finalizes both prepared statements,
 * prints a summary that includes the row count of the entries table and
 * closes the database. Does nothing while either condition is still unmet.
 */
function complete_loading() {
    if (!readline_complete || entries_inserted !== lines_read) {
        return;
    }
    stmt1.finalize();
    stmt2.finalize();
    db.each("SELECT count(*) AS cnt FROM entries", function(err, row) {
        if (err) {
            /* on failure no row is passed in, so reading row.cnt would throw */
            console.error("Finished: read=" + lines_read + " inserted=" + entries_inserted +
                " (failed to count table rows: " + err + ")");
            return;
        }
        console.log("Finished: read=" + lines_read + " inserted=" + entries_inserted + " table_rows=" + row.cnt);
    });
    db.close();
}
/* Read the cats file line by line and queue one insert per entry. Each line
   holds an entry name in its first tab-separated field; '#' starts a comment
   and a leading '-' marks an entry that is stored with the comment 'ignore'. */
console.log("Reading category list...");
lineReader.eachLine(init_cats, function(line, last) {
    /* skip comments and empty lines */
    var text = line.replace(/#.*$/, '');
    if (/^\s*$/.test(text)) return;
    var entry = text.split("\t")[0].trim();
    var comment = '';
    if (entry.charAt(0) === '-') {
        comment = 'ignore';
        entry = entry.substring(1);
    }
    lines_read++;
    stmt1.run(entry, comment, function(err) {
        /* counts finished insert attempts, failed ones included, so that
           complete_loading() can tell when nothing is pending any more */
        ++entries_inserted;
        if (err) {
            /* report the failure (e.g. an entry listed twice) and do not
               record a cat_src row for an entry that was not stored */
            console.error("Failed to insert entry '" + entry + "': " + err);
        } else {
            stmt2.run(entry);
        }
        complete_loading();
    });
}).then(function() {
    /* the whole file has been read; inserts may still be in flight */
    readline_complete = true;
    complete_loading();
});