diff --git a/api/v2/lib/dd2queries.js b/api/v2/lib/dd2queries.js index c6508a9..27cca80 100644 --- a/api/v2/lib/dd2queries.js +++ b/api/v2/lib/dd2queries.js @@ -92,40 +92,44 @@ const dd2queries = function(queryObject) { return q; }; -// We go through the data dictionary and create a data structure that -// groups various keys of a resource field by the related query string -// param submitted via a REST API. There is another grouping that helps -// doing a quick look up of the resourceID key for a given resource, -// for example, 'treatmentId' for 'treatments'. This is how ddKeys looks -// -// "byQueryString": { -// "treatments": { -// "treatmentId": { -// "sqlName": "treatmentId", -// "queryable": "equal", -// "table": false, -// "resourceId": "treatmentId" -// }, -// "treatmentTitle": { -// "sqlName": "treatmentTitle", -// "queryable": "like", -// "table": false, -// "resourceId": false -// }, -// … other fields … -// }, -// … other resources … -// }, -// "byResourceIds": { -// "treatments": "treatmentId", -// "figureCitations": "figureCitationId", -// "bibRefCitations": "bibRefCitationId", -// "treatmentCitations": "treatmentCitationId", -// "materialsCitations": "materialsCitationId", -// "treatmentAuthors": "treatmentAuthorId", -// "images": "id", -// "publications": "id" -// } +/************************************************************************* + + We go through the data dictionary and create a data structure that + groups various keys of a resource field by the related query string + param submitted via a REST API. There is another grouping that helps + doing a quick look up of the resourceID key for a given resource, + for example, 'treatmentId' for 'treatments'. 
This is how ddKeys looks + + "byQueryString": { + "treatments": { + "treatmentId": { + "sqlName": "treatmentId", + "queryable": "equal", + "table": false, + "resourceId": "treatmentId" + }, + "treatmentTitle": { + "sqlName": "treatmentTitle", + "queryable": "like", + "table": false, + "resourceId": false + }, + … other fields … + }, + … other resources … + }, + "byResourceIds": { + "treatments": "treatmentId", + "figureCitations": "figureCitationId", + "bibRefCitations": "bibRefCitationId", + "treatmentCitations": "treatmentCitationId", + "materialsCitations": "materialsCitationId", + "treatmentAuthors": "treatmentAuthorId", + "images": "id", + "publications": "id" + } + +*************************************************************************/ const getDdKeys = function() { const byQueryString = {}; @@ -227,18 +231,22 @@ const calcConstraint = function(ddKeys, queryObject) { } else if (op === 'between') { - // else if (col === 'lat') { - // cols.push('latitude > @min_latitude'); - // cols.push('latitude < @max_latitude'); - // queryObject.min_latitude = queryObject.lat - range; - // queryObject.max_latitude = +queryObject.lat + range; - // } - // else if (col === 'lon') { - // cols.push('longitude > @min_longitude'); - // cols.push('longitude < @max_longitude'); - // queryObject.min_longitude = queryObject.lon - range; - // queryObject.max_longitude = +queryObject.lon + range; - // } + /*********************************************************************** + + else if (col === 'lat') { + cols.push('latitude > @min_latitude'); + cols.push('latitude < @max_latitude'); + queryObject.min_latitude = queryObject.lat - range; + queryObject.max_latitude = +queryObject.lat + range; + } + else if (col === 'lon') { + cols.push('longitude > @min_longitude'); + cols.push('longitude < @max_longitude'); + queryObject.min_longitude = queryObject.lon - range; + queryObject.max_longitude = +queryObject.lon + range; + } + + 
***********************************************************************/ const delta = 0.9; queryObject[`min_${k}`] = +queryObject[k] - delta; @@ -253,20 +261,22 @@ const calcConstraint = function(ddKeys, queryObject) { return [ matchTables, constraint ]; }; -// A query is made up of seven parts -// -// three mandatory parts -// ----------------------------- -// SELECT -// FROM -// WHERE -// -// four optional parts -// ----------------------------- -// GROUP BY -// ORDER BY -// LIMIT -// OFFSET +/*********************************************************************** + + A query is made up of seven parts + three mandatory parts + ----------------------------- + SELECT + FROM + WHERE + four optional parts + ----------------------------- + GROUP BY + ORDER BY + LIMIT + OFFSET + +***********************************************************************/ const calcQuery = function(ddKeys, queryGroup, query, queryObject, matchTables, additionalConstraint) { //console.log(additionalConstraint) @@ -334,12 +344,12 @@ const test = function() { // page: 1, // size: 30, resource: 'treatments', - author: 'fisher', + // author: 'fisher', // facets: false, // stats: false, // xml: false, // sortBy: 'journalYear:ASC', - // q: 'carabus', + q: 'Rhinolophus sinicus', // authorityName: 'Agosti', // journalYear: '1996', // format: 'xml', diff --git a/api/v2/lib/qparts.js b/api/v2/lib/qparts.js index 86e8c85..9d21b55 100644 --- a/api/v2/lib/qparts.js +++ b/api/v2/lib/qparts.js @@ -68,7 +68,7 @@ const queryParts = { constraint: ['treatments.deleted = 0'], sortBy: { columns: [ - 'journalYear', + 'journalYear', 'treatmentTitle' ], defaultSort: { diff --git a/bin/truebug.js b/bin/truebug.js index 7554799..8663ab0 100644 --- a/bin/truebug.js +++ b/bin/truebug.js @@ -1,17 +1,28 @@ 'use strict'; +const config = require('config'); +const Utils = require('../api/v2/utils'); const opts = require('./truebug/opts') -//const download = require('./truebug/download'); +const download = 
require('./truebug/download'); const parse = require('./truebug/parse'); const database = require('./truebug/database'); +const plog = require(config.get('plog')); + +let timer = process.hrtime(); // download new files zip archive if (opts.download) { - download(); + download(opts.download); } if (opts.database) { - database.createTables(); + + // tables will get created if they don't already exist + database.createTablesStatic(); + + // insert statements will be prepared and stored to run + // as transactions + database.createInsertStatements(); } if (opts.parse) { @@ -19,5 +30,16 @@ if (opts.parse) { } if (opts.database) { - database.indexTables(); -} \ No newline at end of file + database.indexTablesStatic(); +} + +timer = process.hrtime(timer); +plog.logger({ + host: 'localhost', + start: 'start', + end: 'end', + status: 200, + resource: 'parse', + query: `parsed`, + message: Utils.timerFormat(timer) +}); \ No newline at end of file diff --git a/bin/truebug/database.js b/bin/truebug/database.js index bea890f..001ef4e 100644 --- a/bin/truebug/database.js +++ b/bin/truebug/database.js @@ -3,7 +3,8 @@ const Database = require('better-sqlite3'); const config = require('config'); const dataDict = require(config.get('v2.dataDict')); -const db = new Database(config.get('data.treatments')); +const dataDictionary = dataDict.dataDictionary; +const db = new Database(config.get('data.treatmentsTmp')); const debug = false; const deBugger = function(options) { @@ -18,9 +19,11 @@ const deBugger = function(options) { if (options.debug) { if (type === 'insert') { if (values.length) { - let istmt = database.insertStmts[table]; - values.forEach(v => {istmt = istmt.replace(/\?/, `'${v}'`)}); - console.log(istmt); + let istmt = database.insertStatic[table]; + //values.forEach(v => {istmt = istmt.replace(/\?/, `'${v}'`)}); + //console.log(values) + //console.log('done') + //console.log(istmt); } } else { @@ -28,17 +31,33 @@ const deBugger = function(options) { } } else { - if (type 
=== 'createInsert') { + if (type === 'createInsertStatements') { + console.log(`- creating insert statement ${table}`); database.insertStmts[table] = db.prepare(stmt); } else if (type === 'insert') { + //database.insertStmts[table].run(values); + // console.log(table) + // console.log('----------------------------------') + // console.log(values) + // console.log('==================================\n') + //console.log(database.insertStatic[table]) database.insertStmts[table].run(values); } else if (type === 'create') { + console.log(`- creating table ${table}`); db.prepare(stmt).run(); } else if (type === 'index') { - db.prepare(stmt).run(); + try { + console.log(`- creating index ${table}`); + db.prepare(stmt).run(); + } + catch(error) { + console.log(`… skipping index ${table} (already exists)`); + } + + } } }; @@ -47,20 +66,39 @@ const database = { createTables: function() { - for (let table in dataDict) { + for (let table in dataDictionary) { let cols = []; let colsWithTypes = []; let colsForBinding = []; - dataDict[table].forEach(f => { - cols.push( f.plazi ); - colsWithTypes.push( f.plazi + ' ' + f.type ); - colsForBinding.push( '?' ); + dataDictionary[table].forEach(f => { + + if (f.sqlType) { + + if ( f.plaziName === 'q') { + cols.push( 'fulltext' ); + } + else if ( f.plaziName === 'order' ) { + cols.push( '"order"' ); + } + else { + cols.push( f.plaziName ); + } + + colsWithTypes.push( f.plaziName + ' ' + f.sqlType ); + colsForBinding.push( '?' ); + } + }); + cols.push( 'inserted' ); + colsWithTypes.push( "inserted INTEGER DEFAULT (strftime('%s','now'))" ); + colsForBinding.push( '?' 
); + // add a primary key to all the tables - colsWithTypes.unshift('id INTEGER PRIMARY KEY'); + // not needed with the new datadictionary + //colsWithTypes.unshift('id INTEGER PRIMARY KEY'); // for making the UNIQUE indexes let colUniq = []; @@ -147,95 +185,580 @@ const database = { deBugger({debug: debug, type: 'create', stmt: createViewActiveTreatments, table: 'activeTreatments', values: []}); }, + createTablesStatic: function() { + + const tables = { + treatments: `CREATE TABLE IF NOT EXISTS treatments ( + id INTEGER PRIMARY KEY, + treatmentId TEXT NOT NULL UNIQUE, + treatmentTitle TEXT, + doi TEXT, + zenodoDep TEXT, + zoobank TEXT, + articleTitle TEXT, + publicationDate TEXT, + journalTitle TEXT, + journalYear TEXT, + journalVolume TEXT, + journalIssue TEXT, + pages TEXT, + authorityName TEXT, + authorityYear TEXT, + kingdom TEXT, + phylum TEXT, + "order" TEXT, + family TEXT, + genus TEXT, + species TEXT, + status TEXT, + taxonomicNameLabel TEXT, + rank TEXT, + q TEXT, + author TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER +)`, + + treatmentAuthors: `CREATE TABLE IF NOT EXISTS treatmentAuthors ( + id INTEGER PRIMARY KEY, + treatmentAuthorId TEXT NOT NULL, + treatmentId TEXT NOT NULL, + treatmentAuthor TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER, + UNIQUE (treatmentAuthorId, treatmentId) +)`, + + materialsCitations: `CREATE TABLE IF NOT EXISTS materialsCitations ( + id INTEGER PRIMARY KEY, + materialsCitationId TEXT NOT NULL, + treatmentId TEXT NOT NULL, + collectingDate TEXT, + collectionCode TEXT, + collectorName TEXT, + country TEXT, + collectingRegion TEXT, + municipality TEXT, + county TEXT, + stateProvince TEXT, + location TEXT, + locationDeviation TEXT, + specimenCountFemale TEXT, + specimenCountMale TEXT, + specimenCount TEXT, + specimenCode TEXT, + typeStatus TEXT, + determinerName TEXT, + collectedFrom TEXT, + collectingMethod TEXT, + 
latitude REAL, + longitude REAL, + elevation REAL, + httpUri TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER, + UNIQUE (materialsCitationId, treatmentId) +)`, + + treatmentCitations: `CREATE TABLE IF NOT EXISTS treatmentCitations ( + id INTEGER PRIMARY KEY, + treatmentCitationId TEXT NOT NULL, + treatmentId TEXT NOT NULL, + treatmentCitation TEXT, + refString TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER, + UNIQUE (treatmentCitationId, treatmentId) +)`, + + figureCitations: `CREATE TABLE IF NOT EXISTS figureCitations ( + id INTEGER PRIMARY KEY, + figureCitationId TEXT NOT NULL, + treatmentId TEXT NOT NULL, + captionText TEXT, + httpUri TEXT, + thumbnailUri TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER, + UNIQUE (figureCitationId, treatmentId) +)`, + + bibRefCitations: `CREATE TABLE IF NOT EXISTS bibRefCitations ( + id INTEGER PRIMARY KEY, + bibRefCitationId TEXT NOT NULL, + treatmentId TEXT NOT NULL, + refString TEXT, + type TEXT, + year TEXT, + deleted INTEGER DEFAULT 0, + created INTEGER DEFAULT (strftime('%s','now')), + updated INTEGER, + UNIQUE (bibRefCitationId, treatmentId) +)`, + + vtreatments: `CREATE VIRTUAL TABLE IF NOT EXISTS vtreatments USING FTS5(treatmentId, fullText)`, + vfigurecitations: `CREATE VIRTUAL TABLE IF NOT EXISTS vfigurecitations USING FTS5(figureCitationId, captionText)`, + vbibrefcitations: `CREATE VIRTUAL TABLE IF NOT EXISTS vbibrefcitations USING FTS5(bibRefCitationId, refString)` + }; + + for (let t in tables) { + deBugger({ + debug: debug, + type: 'create', + stmt: tables[t], + table: t, + values: [] + }); + } + }, + // store the insert statements for later use insertStmts: {}, + + createInsertStatements: function() { + + const updateTime = Math.floor(new Date().getTime() / 1000); + + const insertStatements = { + treatments: `INSERT INTO treatments ( + treatmentId, + 
treatmentTitle, + doi, + zenodoDep, + zoobank, + articleTitle, + publicationDate, + journalTitle, + journalYear, + journalVolume, + journalIssue, + pages, + authorityName, + authorityYear, + kingdom, + phylum, + "order", + family, + genus, + species, + status, + taxonomicNameLabel, + rank, + q, + deleted + ) + VALUES ( + @treatmentId, + @treatmentTitle, + @doi, + @zenodoDep, + @zoobank, + @articleTitle, + @publicationDate, + @journalTitle, + @journalYear, + @journalVolume, + @journalIssue, + @pages, + @authorityName, + @authorityYear, + @kingdom, + @phylum, + @order, + @family, + @genus, + @species, + @status, + @taxonomicNameLabel, + @rank, + @q, + @deleted + ) + ON CONFLICT (treatmentId) + DO UPDATE SET + treatmentTitle=excluded.treatmentTitle, + doi=excluded.doi, + zenodoDep=excluded.zenodoDep, + zoobank=excluded.zoobank, + articleTitle=excluded.articleTitle, + publicationDate=excluded.publicationDate, + journalTitle=excluded.journalTitle, + journalYear=excluded.journalYear, + journalVolume=excluded.journalVolume, + journalIssue=excluded.journalIssue, + pages=excluded.pages, + authorityName=excluded.authorityName, + authorityYear=excluded.authorityYear, + kingdom=excluded.kingdom, + phylum=excluded.phylum, + "order"=excluded."order", + family=excluded.family, + genus=excluded.genus, + species=excluded.species, + status=excluded.status, + taxonomicNameLabel=excluded.taxonomicNameLabel, + rank=excluded.rank, + q=excluded.q, + author=excluded.author, + deleted=excluded.deleted, + updated=${updateTime}`, + + treatmentAuthors: `INSERT INTO treatmentAuthors ( + treatmentAuthorId, + treatmentId, + treatmentAuthor, + deleted + ) + VALUES ( + @treatmentAuthorId, + @treatmentId, + @treatmentAuthor, + @deleted + ) + ON CONFLICT (treatmentAuthorId, treatmentId) + DO UPDATE SET + treatmentId=excluded.treatmentId, + treatmentAuthor=excluded.treatmentAuthor, + deleted=excluded.deleted, + updated=${updateTime}`, - loadData: function(data) { + materialsCitations: `INSERT INTO 
materialsCitations ( + materialsCitationId, + treatmentId, + collectingDate, + collectionCode, + collectorName, + country, + collectingRegion, + municipality, + county, + stateProvince, + location, + locationDeviation, + specimenCountFemale, + specimenCountMale, + specimenCount, + specimenCode, + typeStatus, + determinerName, + collectedFrom, + collectingMethod, + latitude, + longitude, + elevation, + httpUri, + deleted + ) + VALUES ( + @materialsCitationId, + @treatmentId, + @collectingDate, + @collectionCode, + @collectorName, + @country, + @collectingRegion, + @municipality, + @county, + @stateProvince, + @location, + @locationDeviation, + @specimenCountFemale, + @specimenCountMale, + @specimenCount, + @specimenCode, + @typeStatus, + @determinerName, + @collectedFrom, + @collectingMethod, + @latitude, + @longitude, + @elevation, + @httpUri, + @deleted + ) + ON CONFLICT (materialsCitationId, treatmentId) + DO UPDATE SET + treatmentId=excluded.treatmentId, + collectingDate=excluded.collectingDate, + collectionCode=excluded.collectionCode, + collectorName=excluded.collectorName, + country=excluded.country, + collectingRegion=excluded.collectingRegion, + municipality=excluded.municipality, + county=excluded.county, + stateProvince=excluded.stateProvince, + location=excluded.location, + locationDeviation=excluded.locationDeviation, + specimenCountFemale=excluded.specimenCountFemale, + specimenCountMale=excluded.specimenCountMale, + specimenCount=excluded.specimenCount, + specimenCode=excluded.specimenCode, + typeStatus=excluded.typeStatus, + determinerName=excluded.determinerName, + collectedFrom=excluded.collectedFrom, + collectingMethod=excluded.collectingMethod, + latitude=excluded.latitude, + longitude=excluded.longitude, + elevation=excluded.elevation, + httpUri=excluded.httpUri, + deleted=excluded.deleted, + updated=${updateTime}`, + + treatmentCitations: `INSERT INTO treatmentCitations ( + treatmentCitationId, + treatmentId, + treatmentCitation, + refString, + 
deleted + ) + VALUES ( + @treatmentCitationId, + @treatmentId, + @treatmentCitation, + @refString, + @deleted + ) + ON CONFLICT (treatmentCitationId, treatmentId) + DO UPDATE SET + treatmentId=excluded.treatmentId, + treatmentCitation=excluded.treatmentCitation, + refString=excluded.refString, + deleted=excluded.deleted, + updated=${updateTime}`, + + //thumbnailUri, + figureCitations: `INSERT INTO figureCitations ( + figureCitationId, + treatmentId, + captionText, + httpUri, + + deleted + ) + VALUES ( + @figureCitationId, + @treatmentId, + @captionText, + @httpUri, + + @deleted + ) + ON CONFLICT (figureCitationId, treatmentId) + DO UPDATE SET + treatmentId=excluded.treatmentId, + captionText=excluded.captionText, + httpUri=excluded.httpUri, + + deleted=excluded.deleted, + updated=${updateTime}`, + + bibRefCitations: `INSERT INTO bibRefCitations ( + bibRefCitationId, + treatmentId, + refString, + type, + year, + deleted + ) + VALUES ( + @bibRefCitationId, + @treatmentId, + @refString, + @type, + @year, + @deleted + ) + ON CONFLICT (bibRefCitationId, treatmentId) + DO UPDATE SET + treatmentId=excluded.treatmentId, + refString=excluded.refString, + type=excluded.type, + year=excluded.year, + deleted=excluded.deleted, + updated=${updateTime}`, + + vtreatments: 'INSERT INTO vtreatments SELECT treatmentId, q FROM treatments WHERE deleted = 0', - // The data structure submitted to `loadData()` looks as follows - // - // data = [ - // { - // treatment: { }, - // treatmentAuthors: [ [{}, {} … ] ], - // materialCitations: [ [{}, {} … ] ], - // treatmentCitations: [ [{}, {} … ] ], - // figureCitations: [ [{}, {} … ] ], - // bibRefCitations: [ [{}, {} … ] ] - // } - // ] - // - // We need to convert this hierarchical array of treatments into - // a separate array for each part of the treatment so they can be - // inserted into the separate SQL tables. 
However, we also have - // add an extra 'treatmentId' key to all the componoents of a - // treatment so they can be linked together in a SQL JOIN query. - // So the above data structure will be converted to the following - // - // d = { - // treatments: [], - // treatmentAuthors: [], - // materialsCitations: [], - // treatmentCitations: [], - // figureCitations: [], - // bibRefCitations: [] - // } + vfigurecitations: 'INSERT INTO vfigurecitations SELECT figureCitationId, captionText FROM figureCitations WHERE deleted = 0', - for (let table in dataDict) { + vbibrefcitations: 'INSERT INTO vbibrefcitations SELECT bibRefCitationId, refString FROM bibRefCitations WHERE deleted = 0' + }; - let d = { - treatments: [], - treatmentAuthors: [], - materialsCitations: [], - treatmentCitations: [], - figureCitations: [], - bibRefCitations: [] - } + for (let table in insertStatements) { + deBugger({ + debug: debug, + type: 'createInsertStatements', + stmt: insertStatements[ table ], + table: table, + values: [] + }); + } - for (let i = 0, j = data.length; i < j; i++) { + + }, + + loadData: function(data) { - if (table === 'treatments') { - d.treatments.push(data[i].treatment) + /*************************************************************************** + * + * The data structure submitted to `loadData()` looks as follows + * + * data = [ + * + * // treatment 1 and its related data + * { + * treatment: { }, + * treatmentAuthors: [ {}, {} … ], + * materialCitations: [ {}, {} … ], + * treatmentCitations: [ {}, {} … ], + * figureCitations: [ {}, {} … ], + * bibRefCitations: [ {}, {} … ] + * }, + * + * // treatment 2 and its related data + * { + * treatment: { }, + * treatmentAuthors: [ {}, {} … ], + * materialCitations: [ {}, {} … ], + * treatmentCitations: [ {}, {} … ], + * figureCitations: [ {}, {} … ], + * bibRefCitations: [ {}, {} … ] + * } + * ] + * + * We need to convert this hierarchical array of treatments into + * a separate array for each part of the treatment so they can 
be + * inserted into the separate SQL tables. However, we also have + * add an extra 'treatmentId' key to all the componoents of a + * treatment so they can be linked together in a SQL JOIN query. + * So the above data structure will be converted to the following + * + * d = { + * treatments: [ {}, {} … ], + * treatmentAuthors: [ {}, {} … ], + * materialsCitations: [ {}, {} … ], + * treatmentCitations: [ {}, {} … ], + * figureCitations: [ {}, {} … ], + * bibRefCitations: [ {}, {} … ] + * } + * + ***************************************************************************/ + + const d = { + treatments: [], + // treatmentAuthors: [], + // materialsCitations: [], + // treatmentCitations: [], + // figureCitations: [], + // bibRefCitations: [] + }; + + + for (let i = 0, j = data.length; i < j; i++) { + + const t = data[i]; + + for (let table in t) { + + + if (table === 'treatment') { + d.treatments.push( t[ table ] ); } else { + //d[ table ].push( t[ table ] ); + d[ table ] = t[ table ]; + } + } + } - // While processing different parts of a 'treatment' - // such as 'treatmentCitations', 'materialsCitation' - // 'treatmentAuthors', 'figureCitations' and - // 'bibrefCitations' we have to check whether not the - // array exists. For example, if no 'treatmentAuthors' - // were found for a specific treatment, for that - // 'treatment' the 'treatmentAuthors' array will be - // undefined. In that case we don't process it because - // there is nothing to insert into the database. - if (typeof(data[i][table]) !== 'undefined') { - - // for each component of the 'treatment', we take each - // element of the array, ultimately a new row in the - // database, and insert it into a separate array. 
- for (let r in data[i][table]) { - d[table].push(data[i][table][r]) - } - + for (let table in d) { + + if (d[ table ].length) { + + const insertMany = db.transaction((rows) => { + for (const row of rows) { + deBugger({ + debug: debug, + type: 'insert', + stmt: '', + table: table, + values: row + }); } - - } + }); + + insertMany(d[ table ]); } + } - const insertMany = db.transaction((rows) => { - for (const row of rows) { - const r = Object.values(row); - deBugger({debug: debug, type: 'insert', stmt: '', table: table, values: r}); - } - }); + // for (let table in dataDictionary) { + + // let d = { + // treatments: [], + // treatmentAuthors: [], + // materialsCitations: [], + // treatmentCitations: [], + // figureCitations: [], + // bibRefCitations: [] + // } + + // for (let i = 0, j = data.length; i < j; i++) { + + // if (table === 'treatments') { + // d.treatments.push(data[i].treatment); + // } + // else { + + // /**************************************************************************** + // * + // * While processing different parts of a 'treatment' + // * such as 'treatmentCitations', 'materialsCitation' + // * 'treatmentAuthors', 'figureCitations' and + // * 'bibrefCitations' we have to check whether not the + // * array exists. For example, if no 'treatmentAuthors' + // * were found for a specific treatment, for that + // * 'treatment' the 'treatmentAuthors' array will be + // * undefined. In that case we don't process it because + // * there is nothing to insert into the database. + // * + // ****************************************************************************/ + // if (typeof(data[i][table]) !== 'undefined') { + + // /**************************************************************************** + // * + // * for each component of the 'treatment', we take each + // * element of the array, ultimately a new row in the + // * database, and insert it into a separate array. 
+ // * + // ****************************************************************************/ + // for (let r in data[i][table]) { + // d[table].push(data[i][table][r]) + // } + + // } + + // } + // } + + // const insertMany = db.transaction((rows) => { + // for (const row of rows) { + // const r = Object.values(row); + // deBugger({debug: debug, type: 'insert', stmt: '', table: table, values: r}); + // } + // }); - for (let t in d) { - if (d[t].length) { - insertMany(d[t]); - } - } + // for (let t in d) { + // if (d[t].length) { + // insertMany(d[t]); + // } + // } - } + // } }, indexTables: function() { @@ -259,20 +782,26 @@ const database = { // }); // index treatents table on each queryable field - for (let t in dataDict) { + for (let table in dataDictionary) { - const table = dataDict[t]; - let i = 0, j = table.length; + const columns = dataDictionary[ table ]; + let i = 0, j = columns.length; for (; i < j; i++) { - const col = table[i]; - let colname = col.plazi; - let colName = colname.replace(/"/g, ''); + const column = columns[i]; + const colname = column.plaziName.replace(/"/g, ''); + //let colName = colname.replace(/"/g, ''); - if (col.queryable) { + if (column.queryable) { //bar.tick(1); - const indexStmt = `CREATE INDEX IF NOT EXISTS ix_${t}_${colName} ON ${t} (${colname}) WHERE deleted = 0`; - deBugger({debug: debug, type: 'index', stmt: indexStmt, table: t, values: []}); + const indexStmt = `CREATE INDEX IF NOT EXISTS ix_${t}_${colname} ON ${table} (${colname}) WHERE deleted = 0`; + deBugger({ + debug: debug, + type: 'index', + stmt: indexStmt, + table: table, + values: [] + }); } } @@ -282,14 +811,104 @@ const database = { const i = cols.indexOf('"order"'); let name = cols.join('_').replace(/"/g, ''); const ixStmt = `CREATE INDEX IF NOT EXISTS ix_treatments_${name} ON treatments (${cols.join(', ')}) WHERE deleted = 0`; - deBugger({debug: debug, type: 'index', stmt: ixStmt, table: 'treatments', values: []}); + deBugger({ + debug: debug, + type: 'index', 
+ stmt: ixStmt, + table: 'treatments', + values: [] + }); }); let facets = config.get('v2.facets'); facets.unshift('treatmentId'); //CREATE INDEX IF NOT EXISTS ix_treatments_facets ON treatments (deleted, treatmentId, journalTitle, journalYear, kingdom, phylum, "order", family, genus, species, status, rank) WHERE deleted = 0 const ixStmt = `CREATE INDEX IF NOT EXISTS ix_treatments_facets ON treatments (deleted, ${facets.join(', ')}) WHERE deleted = 0`; - deBugger({debug: debug, type: 'index', stmt: ixStmt, table: 'treatments', values: []}); + deBugger({ + debug: debug, + type: 'index', + stmt: ixStmt, + table: 'treatments', + values: [] + }); + }, + + indexTablesStatic: function() { + + const indexes = { + ix_treatmentCitations_treatmentCitation : 'CREATE INDEX ix_treatmentCitations_treatmentCitation ON treatmentCitations (deleted, Lower(treatmentCitation)) WHERE deleted = 0', + ix_treatmentCitations_refString : 'CREATE INDEX ix_treatmentCitations_refString ON treatmentCitations (deleted, Lower(refString)) WHERE deleted = 0', + ix_bibRefCitations_year : 'CREATE INDEX ix_bibRefCitations_year ON bibRefCitations (deleted, year) WHERE deleted = 0', + ix_treatments_treatmentId : 'CREATE INDEX ix_treatments_treatmentId ON treatments (deleted, treatmentId)', + ix_treatments_treatmentTitle : 'CREATE INDEX ix_treatments_treatmentTitle ON treatments (deleted, treatmentTitle COLLATE NOCASE)', + ix_treatments_articleTitle : 'CREATE INDEX ix_treatments_articleTitle ON treatments (deleted, articleTitle COLLATE NOCASE)', + ix_treatments_publicationDate : 'CREATE INDEX ix_treatments_publicationDate ON treatments (deleted, publicationDate)', + ix_treatments_journalTitle : 'CREATE INDEX ix_treatments_journalTitle ON treatments (deleted, journalTitle COLLATE NOCASE)', + ix_treatments_journalYear : 'CREATE INDEX ix_treatments_journalYear ON treatments (deleted, journalYear)', + ix_treatments_authorityName : 'CREATE INDEX ix_treatments_authorityName ON treatments (deleted, 
authorityName COLLATE NOCASE)', + ix_treatments_taxonomicNameLabel : 'CREATE INDEX ix_treatments_taxonomicNameLabel ON treatments (deleted, taxonomicNameLabel COLLATE NOCASE)', + ix_treatments_kingdom : 'CREATE INDEX ix_treatments_kingdom ON treatments (deleted, kingdom COLLATE NOCASE)', + ix_treatments_phylum : 'CREATE INDEX ix_treatments_phylum ON treatments (deleted, phylum COLLATE NOCASE)', + ix_treatments_order : 'CREATE INDEX ix_treatments_order ON treatments (deleted, "order" COLLATE NOCASE)', + ix_treatments_family : 'CREATE INDEX ix_treatments_family ON treatments (deleted, family COLLATE NOCASE)', + ix_treatments_genus : 'CREATE INDEX ix_treatments_genus ON treatments (deleted, genus COLLATE NOCASE)', + ix_treatments_species : 'CREATE INDEX ix_treatments_species ON treatments (deleted, species COLLATE NOCASE)', + ix_treatments_status : 'CREATE INDEX ix_treatments_status ON treatments (deleted, status COLLATE NOCASE)', + ix_treatments_rank : 'CREATE INDEX ix_treatments_rank ON treatments (deleted, rank COLLATE NOCASE)', + ix_treatments_k_phylum : 'CREATE INDEX ix_treatments_k_phylum ON treatments (deleted, kingdom, phylum)', + ix_treatments_k_p_order : 'CREATE INDEX ix_treatments_k_p_order ON treatments (deleted, kingdom, phylum, "order")', + ix_treatments_k_p_o_family : 'CREATE INDEX ix_treatments_k_p_o_family ON treatments (deleted, kingdom, phylum, "order", family)', + ix_treatments_k_p_o_f_genus : 'CREATE INDEX ix_treatments_k_p_o_f_genus ON treatments (deleted, kingdom, phylum, "order", family, genus)', + ix_treatments_k_p_o_f_g_species : 'CREATE INDEX ix_treatments_k_p_o_f_g_species ON treatments (deleted, kingdom, phylum, "order", family, genus, species)', + ix_treatments_facets : 'CREATE INDEX ix_treatments_facets ON treatments (deleted, treatmentId, journalTitle, journalYear, kingdom, phylum, "order", family, genus, species, status, rank)', + ix_treatments_deleted : 'CREATE INDEX ix_treatments_deleted ON treatments (deleted)', + 
ix_treatmentAuthors_treatmentAuthorId : 'CREATE INDEX ix_treatmentAuthors_treatmentAuthorId ON treatmentAuthors (deleted, treatmentAuthorId)', + ix_treatmentAuthors_treatmentId : 'CREATE INDEX ix_treatmentAuthors_treatmentId ON treatmentAuthors (deleted, treatmentId)', + ix_treatmentAuthors_treatmentAuthor : 'CREATE INDEX ix_treatmentAuthors_treatmentAuthor ON treatmentAuthors (deleted, treatmentAuthor COLLATE NOCASE)', + ix_treatmentAuthors_deleted : 'CREATE INDEX ix_treatmentAuthors_deleted ON treatmentAuthors (deleted)', + ix_materialsCitations_materialsCitationId: 'CREATE INDEX ix_materialsCitations_materialsCitationId ON materialsCitations (deleted, materialsCitationId)', + ix_materialsCitations_treatmentId : 'CREATE INDEX ix_materialsCitations_treatmentId ON materialsCitations (deleted, treatmentId)', + ix_materialsCitations_collectingDate : 'CREATE INDEX ix_materialsCitations_collectingDate ON materialsCitations (deleted, collectingDate COLLATE NOCASE)', + ix_materialsCitations_collectionCode : 'CREATE INDEX ix_materialsCitations_collectionCode ON materialsCitations (deleted, collectionCode COLLATE NOCASE)', + ix_materialsCitations_collectorName : 'CREATE INDEX ix_materialsCitations_collectorName ON materialsCitations (deleted, collectorName COLLATE NOCASE)', + ix_materialsCitations_country : 'CREATE INDEX ix_materialsCitations_country ON materialsCitations (deleted, country COLLATE NOCASE)', + ix_materialsCitations_collectingRegion : 'CREATE INDEX ix_materialsCitations_collectingRegion ON materialsCitations (deleted, collectingRegion COLLATE NOCASE)', + ix_materialsCitations_municipality : 'CREATE INDEX ix_materialsCitations_municipality ON materialsCitations (deleted, municipality COLLATE NOCASE)', + ix_materialsCitations_county : 'CREATE INDEX ix_materialsCitations_county ON materialsCitations (deleted, county COLLATE NOCASE)', + ix_materialsCitations_stateProvince : 'CREATE INDEX ix_materialsCitations_stateProvince ON materialsCitations (deleted, 
stateProvince COLLATE NOCASE)', + ix_materialsCitations_location : 'CREATE INDEX ix_materialsCitations_location ON materialsCitations (deleted, location COLLATE NOCASE)', + ix_materialsCitations_locationDeviation : 'CREATE INDEX ix_materialsCitations_locationDeviation ON materialsCitations (deleted, locationDeviation COLLATE NOCASE)', + ix_materialsCitations_specimenCountFemale: 'CREATE INDEX ix_materialsCitations_specimenCountFemale ON materialsCitations (deleted, specimenCountFemale COLLATE NOCASE)', + ix_materialsCitations_specimenCountMale : 'CREATE INDEX ix_materialsCitations_specimenCountMale ON materialsCitations (deleted, specimenCountMale COLLATE NOCASE)', + ix_materialsCitations_specimenCount : 'CREATE INDEX ix_materialsCitations_specimenCount ON materialsCitations (deleted, specimenCount COLLATE NOCASE)', + ix_materialsCitations_specimenCode : 'CREATE INDEX ix_materialsCitations_specimenCode ON materialsCitations (deleted, specimenCode COLLATE NOCASE)', + ix_materialsCitations_typeStatus : 'CREATE INDEX ix_materialsCitations_typeStatus ON materialsCitations (deleted, typeStatus COLLATE NOCASE)', + ix_materialsCitations_determinerName : 'CREATE INDEX ix_materialsCitations_determinerName ON materialsCitations (deleted, determinerName COLLATE NOCASE)', + ix_materialsCitations_collectedFrom : 'CREATE INDEX ix_materialsCitations_collectedFrom ON materialsCitations (deleted, collectedFrom COLLATE NOCASE)', + ix_materialsCitations_collectingMethod : 'CREATE INDEX ix_materialsCitations_collectingMethod ON materialsCitations (deleted, collectingMethod COLLATE NOCASE)', + ix_materialsCitations_latitude : 'CREATE INDEX ix_materialsCitations_latitude ON materialsCitations (deleted, latitude)', + ix_materialsCitations_longitude : 'CREATE INDEX ix_materialsCitations_longitude ON materialsCitations (deleted, longitude)', + ix_materialsCitations_elevation : 'CREATE INDEX ix_materialsCitations_elevation ON materialsCitations (deleted, elevation)', + 
ix_materialsCitations_deleted : 'CREATE INDEX ix_materialsCitations_deleted ON materialsCitations (deleted)', + ix_treatmentCitations_treatmentCitationId: 'CREATE INDEX ix_treatmentCitations_treatmentCitationId ON treatmentCitations (deleted, treatmentCitationId)', + ix_treatmentCitations_treatmentId : 'CREATE INDEX ix_treatmentCitations_treatmentId ON treatmentCitations (deleted, treatmentId)', + ix_treatmentCitations_deleted : 'CREATE INDEX ix_treatmentCitations_deleted ON treatmentCitations (deleted)', + ix_figureCitations_treatmentId : 'CREATE INDEX ix_figureCitations_treatmentId ON figureCitations (deleted, treatmentId)', + ix_figureCitations_figureCitationId : 'CREATE INDEX ix_figureCitations_figureCitationId ON figureCitations (deleted, figureCitationId, treatmentId)', + ix_bibRefCitations_bibRefCitationId : 'CREATE INDEX ix_bibRefCitations_bibRefCitationId ON bibRefCitations (deleted, bibRefCitationId)', + ix_bibRefCitations_treatmentId : 'CREATE INDEX ix_bibRefCitations_treatmentId ON bibRefCitations (deleted, treatmentId)', + ix_bibRefCitations_deleted : 'CREATE INDEX ix_bibRefCitations_deleted ON bibRefCitations (deleted)', + } + + for (let i in indexes) { + deBugger({ + debug: debug, + type: 'index', + stmt: indexes[i], + table: i, + values: [] + }); + } }, loadFTSTreatments: function() { diff --git a/bin/truebug/database1.js b/bin/truebug/database1.js deleted file mode 100644 index 735bee0..0000000 --- a/bin/truebug/database1.js +++ /dev/null @@ -1,9 +0,0 @@ -'use strict'; - -const config = require('config'); -const plog = require(config.get('plog')); -const dd = require('../../api/v2/lib/dd2datadictionary').dataDictionary; - -for (let resource in dd) { - console.log(resource) -} \ No newline at end of file diff --git a/bin/truebug/download.js b/bin/truebug/download.js index 34b1b95..be9bfb5 100644 --- a/bin/truebug/download.js +++ b/bin/truebug/download.js @@ -4,48 +4,67 @@ const exec = require('child_process').exec; const ProgressBar = 
require('progress'); const http = require('http'); const fs = require('fs'); +const path = require('path'); const config = require('config'); -const downloadDir = config.get('bin.renew.download.downloadDir'); -const fileName = config.get('bin.renew.download.fileName'); -const host = config.get('bin.renew.download.host'); -const port = config.get('bin.renew.download.port'); -const pathToFile = config.get('bin.renew.download.pathToFile'); - -module.exports = function() { - - process.chdir(downloadDir); - const file = fs.createWriteStream(fileName); - - const req = http.request({ - host: host, - port: port, - path: pathToFile + fileName - }); - - req.on('response', function(res){ - var len = parseInt(res.headers['content-length'], 10); - - console.log(); - let bar = new ProgressBar(' downloading [:bar] :rate/bps :percent :etas', { - complete: '=', - incomplete: ' ', - width: 20, - total: len - }); - - res.on('data', function (chunk) { - bar.tick(chunk.length); - file.write(chunk); +const hostname = config.get('truebug.hostname'); +const download = config.get('truebug.download'); + +module.exports = function(downloadtype) { + + process.chdir('./data/'); + + // a date-time stamp that looks like `[yyyy-mm-dd]-[hh]h[mm]m[ss]s` + const dt = new Date() + .toISOString() + .replace(/\..+/, '') + .replace(/T(\d\d):(\d\d):(\d\d)/, '-$1h$2m$3s'); + + + if ( downloadtype === 'full' ) { + + const ext = '.zip'; + const basename = path.basename(download[downloadtype], ext); + + // rename the source file by adding date-time stamp to its basename + const filename = `${basename}-${dt}${ext}`; + const target = fs.createWriteStream(filename); + const req = http.request({ + hostname: hostname, + path: `/${download[downloadtype]}` }); - - res.on('end', function () { - file.end(); - exec('unzip ' + fileName); - exec('rm ' + fileName); - console.log('\n'); + + req.on('response', function(res) { + const len = parseInt(res.headers['content-length'], 10); + + let bar = new 
ProgressBar(`downloading ${hostname}/${download[downloadtype]} [:bar] :rate/bps :percent :etas`, { + complete: '=', + incomplete: ' ', + width: 20, + total: len + }); + + res.on('data', function (chunk) { + bar.tick(chunk.length); + target.write(chunk); + }); + + res.on('end', function () { + target.end(); + + console.log(`downloaded ${len} bytes to data/${filename}`); + console.log(`unzipping ${filename} to treatments-${dt}`) + exec(`unzip -q ${filename} -d treatments-${dt}`); + + console.log(`deleting ${filename}`); + exec(`rm ${filename}`); + }); }); - }); - - req.end(); + + req.end(); + + } + else { + + } }; \ No newline at end of file diff --git a/bin/truebug/opts.js b/bin/truebug/opts.js index 2f4e676..262eeb0 100644 --- a/bin/truebug/opts.js +++ b/bin/truebug/opts.js @@ -1,4 +1,5 @@ const argv = require('minimist')(process.argv.slice(2)); +const path = require('path'); let opts = { download: false, @@ -8,7 +9,15 @@ let opts = { parseAll: false } -if (argv.download) opts.download = true; +if (argv.download) { + if (argv.download !== 'full' && argv.download !== 'diff' && argv.download.length !== 32) { + argv.download = false; + } + else { + opts.download = argv.download; + } +}; + if (argv.rearrange) opts.rearrange = true; if (argv.database) opts.database = true; if (argv.parse) opts.parse = argv.parse; @@ -17,13 +26,56 @@ const allParamsFalse = !opts.download && !opts.rearrange && !opts.database && !o if (allParamsFalse || argv.usage) { - console.log(` -usage: truebug --download {false|true} \\ - --rearrange {false|true} \\ - --database {false|true} \\ - --parse {treatment id || n = number of treatments to parse || 'all'}` + console.log(` +truebug v. 2.0. 
+ +Usage: truebug \\ + [--download false || { 'full' || 'diff' || treatment_id }] \\ + [--rearrange false || true] \\ + [--database false || true] \\ + [--parse treatment_id || n = number of treatments to parse || 'all'] + +truebug is an ETL program that can download treatments incrementally (those changed since it was run last), parse the XMLs, insert the data into the database, and rearrange the treatments into a hierarchical directory structure so any single folder doesn't have too many treatments. + +The default action is to do nothing as all options default to false; + +truebug will change its working directory to ~/zenodeo root and run from there. + +The treatments will be downloaded and unzipped to a directory called ~/zenodeo/data/treatments-[yyyy-mm-dd]T[hh]h[mm]m[ss]s + +--parse can be a treatment_id (GUID), or a number (for the number of treatments to parse) or the word 'all' to parse all the treatments. + +Examples: + +1. Parse specific XMLs + +% truebug --parse 038787DAFFF7FF904BBFF925FD13F9AA +% truebug --parse 730087F21E00FF81FF61FC34FDA561A5 + +2. Parse 5000 XMLs from the treatments dump directory + +% truebug --parse 5000 + +3. Parse all the XMLs in the treatments dump directory + +% truebug --parse 'all' + +4. 
Parse all XMLs, insert them in the database, and rearrange them in the treatments directory in a hierachical directory structure so they are easy to work with in the filesystem + +% truebug --parse 'all' --database true --rearrange true` ) } +else { + + // make sure the process runs from ~/zenodeo + const truebughome = path.dirname(process.argv[1]); + process.chdir(truebughome); + + // now go up one + process.chdir('../'); + + // ready +} module.exports = opts; \ No newline at end of file diff --git a/bin/truebug/parse.js b/bin/truebug/parse.js index 1ca6531..e05b4d7 100644 --- a/bin/truebug/parse.js +++ b/bin/truebug/parse.js @@ -2,17 +2,20 @@ const fs = require('fs'); const path = require('path'); + const progress = require('progress'); const cheerio = require('cheerio'); const chance = require('chance').Chance(); const config = require('config'); const dataDict = require(config.get('v2.dataDict')); -const xmlDumpDir = config.get('xmlDumpDir'); -const logger = require(config.get('logger')); +const dataDictionary = dataDict.dataDictionary; + +const treatmentsDump = config.get('truebug.treatmentsDump'); + -// truebug modules -const rearrange = require('./rearrange'); +// // truebug modules +// const rearrange = require('./rearrange'); const database = require('./database'); /* @@ -58,38 +61,42 @@ const stats = function(treatments, endProc) { extracted.treatments = extracted.treatments + j; for (; i < j; i++) { + const treatment = treatments[i]; + if (treatment.treatmentCitations) { extracted.treatmentCitations = extracted.treatmentCitations + treatment.treatmentCitations.length; } + if (treatment.treatmentAuthors) { extracted.treatmentAuthors = extracted.treatmentAuthors + treatment.treatmentAuthors.length; } + if (treatment.materialsCitations) { extracted.materialsCitations = extracted.materialsCitations + treatment.materialsCitations.length; } + if (treatment.figureCitations) { extracted.figureCitations = extracted.figureCitations + 
treatment.figureCitations.length; } + if (treatment.bibRefCitations) { extracted.bibRefCitations = extracted.bibRefCitations + treatment.bibRefCitations.length; } + } - return JSON.stringify(extracted, null, '\t'); + //return JSON.stringify(extracted, null, '\t'); }; const parseOne = function(treatmentId) { - const xml = fs.readFileSync(`${xmlDumpDir}/${treatmentId + '.xml'}`, 'utf8'); - //038787DAFFF7FF904BBFF925FD13F9AA - //730087F21E00FF81FF61FC34FDA561A5 - //const xml = fs.readFileSync(`${process.cwd()}/data/${treatmentId}.xml`, 'utf8'); + const xml = fs.readFileSync(`${treatmentsDump}/${treatmentId + '.xml'}`, 'utf8'); return cheerioparse(xml, treatmentId); }; -// As to the deleted (or retired, or whatever) elements: they are marked with a deleted attribute bearing value true. In addition, they also have deleteUser, deleteTime, and deleteVersion attributes. +// // As to the deleted (or retired, or whatever) elements: they are marked with a deleted attribute bearing value true. In addition, they also have deleteUser, deleteTime, and deleteVersion attributes. 
const parseTreatmentCitations = function($, treatmentId) { @@ -218,15 +225,18 @@ const _parse = function($, elements, parts, partId, treatmentId) { const missingAttr = []; const entry = {}; - dataDict[parts].forEach(el => { - const attr = $(elements[i]).attr(el.plazi); - if (attr) { - entry[el.plazi] = attr; - } - else { - entry[el.plazi] = ''; - missingAttr.push(el.plazi); + dataDictionary[parts].forEach(el => { + if (el.cheerioElement) { + const attr = $(elements[i]).attr(el.plaziName); + if (attr) { + entry[el.plaziName] = attr; + } + else { + entry[el.plaziName] = ''; + missingAttr.push(el.plaziName); + + } } }); @@ -269,25 +279,38 @@ const parseTreament = function($, treatmentId) { let treatment = {}; - dataDict.treatments.forEach(el => { - let val = eval(el.element) || ''; - if (el.plazi === 'treatmentId') { - val = treatmentId; - } - else if (el.plazi === 'deleted') { - if (val && val === 'true') { - val = 1; - } - else { - val = 0; - } - } - - if (typeof val === 'string') { - treatment[el.plazi] = val ? val.trim() : ''; - } - else { - treatment[el.plazi] = val; + + + dataDictionary.treatments.forEach(el => { + + + if (el.cheerioElement) { + let val = eval(el.cheerioElement) || ''; + + //if (val) { + if (el.plaziName === 'treatmentId') { + + val = treatmentId; + } + else if (el.plaziName === 'deleted') { + + //val = val && val === 'true' ? 1 : 0; + if (val && val === 'true') { + val = 1; + } + else { + val = 0; + } + } + + treatment[el.plaziName] = typeof val === 'string' ? val.trim() : val; + // if (typeof val === 'string') { + // treatment[el.plaziName] = val ? 
val.trim() : ''; + // } + // else { + // treatment[el.plaziName] = val; + // } + //} } }) @@ -339,8 +362,8 @@ const cheerioparse = function(xml, treatmentId) { // 'treatmentId' to each remaining object so it can be // used as a foreign key to connect the object to the // parent treatment - //const emptyObjs = (el) => Object.keys(el).length > 0; - //const addTreatmentId = (el) => el.treatmentId = treatmentId; + const emptyObjs = (el) => Object.keys(el).length > 0; + const addTreatmentId = (el) => el.treatmentId = treatmentId; let ta = parseTreatmentAuthors($, treatmentId); if (ta.length) { @@ -374,26 +397,34 @@ module.exports = function(n, rearrangeOpt = false, databaseOpt = false) { //const xmlre = /^[0-9a-f]{8}-?[0-9a-f]{4}-?[1-5][0-9a-f]{3}-?[89ab][0-9a-f]{3}-?[0-9a-f]{12}$/i; if (n.length === 32) { + + console.log(`going to parse treatment ${n}`) const treatment = parseOne(n); console.log('----------------------------------------\n') console.log(treatment); } else { - const start = new Date().getTime(); - const xmlsArr = fs.readdirSync(xmlDumpDir); + // const start = new Date().getTime(); + const xmlsArr = fs.readdirSync(treatmentsDump); let i = 0; let j = typeof(n) === 'number' ? n : xmlsArr.length; - // update the progress bar every x% of the total num of files - // but x% of j should not be more than 10000 - let x = 10; - const transactionLimit = 5000; - if ((j / x) > transactionLimit) { - x = Math.floor(j / transactionLimit); + /************************************************************** + * + * update the progress bar every x% of the total num of files + * but x% of j should not be more than 5000 because we don't + * want to insert more than 5K records at a time. 
+ * + **************************************************************/ + + let batch = 1; + if (j > 50) { + batch = Math.floor(j / 10); } - const tickInterval = Math.floor( j / (j / x) ); + if (j > 5000) batch = 5000; + const tickInterval = Math.floor( j / batch ); const bar = new progress('processing [:bar] :rate files/sec :current/:total done (:percent) time left: :etas', { complete: '=', incomplete: ' ', @@ -401,11 +432,11 @@ module.exports = function(n, rearrangeOpt = false, databaseOpt = false) { total: j }); - const batch = Math.floor(j / x); + let treatments = []; - let endProc = false; + console.log(`- parsing XMLs and inserting into the db ${batch} at a time`) for (; i < j; i++) { if (i == (j - 1)) { @@ -428,7 +459,8 @@ module.exports = function(n, rearrangeOpt = false, databaseOpt = false) { if (!(i % batch)) { - bar.interrupt(stats(treatments, endProc) + '\n'); + //bar.interrupt(stats(treatments, endProc) + '\n'); + stats(treatments, endProc) if (databaseOpt) { database.loadData(treatments); @@ -437,28 +469,24 @@ module.exports = function(n, rearrangeOpt = false, databaseOpt = false) { // empty the treatments for the next batch treatments = []; } + } + stats(treatments, endProc); + console.log('finished\n***********************************') + console.log(extracted); + //bar.interrupt(stats(treatments, endProc) + '\n'); + if (databaseOpt) { database.loadData(treatments); - database.indexTables(); + //database.indexTables(); database.loadFTSTreatments(); database.loadFTSFigureCitations(); database.loadFTSBibRefCitations(); } - - console.log('\n\n') - logger({ - host: 'localhost', - start: start, - end: new Date().getTime(), - status: 200, - resource: 'parse', - query: `parsed ${n}`, - message: stats(treatments, endProc) - }) + } }; \ No newline at end of file diff --git a/config/default.js b/config/default.js index 3e30636..9152d50 100644 --- a/config/default.js +++ b/config/default.js @@ -79,7 +79,9 @@ module.exports = { zenodo: 'https://zenodo.org/api' 
}, - dataDict: path.join(cwd, 'dataDictionary', 'data-dictionary.js'), + //dataDict: path.join(cwd, 'dataDictionary', 'data-dictionary.js'), + dataDict: path.join(cwd, 'api', 'v2', 'lib', 'dd2datadictionary.js'), + schema: path.join(cwd, 'api', 'v2', 'schema.js') }, @@ -87,6 +89,7 @@ module.exports = { data: { logs: path.join(cwd, 'data', 'logs.sqlite'), treatments: path.join(cwd, 'data', 'treatments.sqlite'), + treatmentsTmp: path.join(cwd, 'data', 'treatments-tmp.sqlite'), queryStats: path.join(cwd, 'data', 'queryStats.sqlite'), lookups: path.join(cwd, 'data', 'facets.sqlite'), @@ -111,20 +114,20 @@ module.exports = { { col : 'message', type: 'TEXT' } ], - 'download-program': { - "newTreatmentsDir": path.join(cwd, 'data', 'treatmentsNew'), - "treatmentsListDir": path.join(cwd, 'data'), - "treatmentsListFilename": "listOfTreatments.xml", - "downloadTreatmentsURL": 'http://tb.plazi.org/GgServer/xml/', - "downloadListURL": 'http://tb.plazi.org/GgServer/search?&indexName=0&resultFormat=XML&lastModifiedSince=' - }, - - 'xmlDumpSrc': 'http://tb.plazi.org/GgServer/dumps/plazi.xml.zip', - 'xmlDumpDir': path.join(cwd, 'data', 'treatmentsDump'), - 'dataDict': path.join(cwd, 'dataDictionary', 'data-dictionary.js'), + // 'download-program': { + // "newTreatmentsDir": path.join(cwd, 'data', 'treatmentsNew'), + // "treatmentsListDir": path.join(cwd, 'data'), + // "treatmentsListFilename": "listOfTreatments.xml", + // "downloadTreatmentsURL": 'http://tb.plazi.org/GgServer/xml/', + // "downloadListURL": 'http://tb.plazi.org/GgServer/search?&indexName=0&resultFormat=XML&lastModifiedSince=' + // }, + + // 'xmlDumpSrc': 'http://tb.plazi.org/GgServer/dumps/plazi.xml.zip', + // 'xmlDumpDir': path.join(cwd, 'data', 'treatmentsDump'), + // 'dataDict': path.join(cwd, 'dataDictionary', 'data-dictionary.js'), //'xmlDumpDir': path.join(cwd, 'data', 'treatmentsDump'), //'xmlDumpSrc': 'http://tb.plazi.org/GgServer/dumps/plazi.xml.zip', - 'xmlRearrangedDest': path.join(cwd, 'data', 
'treatments') + //'xmlRearrangedDest': path.join(cwd, 'data', 'treatments'), // http://tb.plazi.org/GgServer/srsStats/stats?outputFields=doc.uuid+doc.zenodoDepId+doc.updateUser+doc.updateDate&groupingFields=doc.uuid+doc.zenodoDepId+doc.updateUser+doc.updateDate&orderingFields=doc.updateDate&FP-doc.updateDate=%222020-02-21%22-&format=JSON @@ -133,8 +136,29 @@ module.exports = { // http://tb.plazi.org/GgServer/srsStats/stats?outputFields=doc.uuid+doc.updateDate&groupingFields=doc.uuid+doc.updateDate&orderingFields=doc.updateDate&FP-doc.updateDate=%222020-02-21%22-&format=JSON + truebug: { + + //hostname: 'http://tb.plazi.org/GgServer', + hostname: '127.0.0.1', + + download: { - // tb.plazi.org/GgServer/dumps/plazi.zenodeo.zip - // tb.plazi.org/GgServer/dumps/plazi.xmlHistory.zip + // full: 'plazi.zenodeo.zip' + // example: 'http://tb.plazi.org/GgServer/dumps/plazi.zenodeo.zip' + full: 'dumps/1B1.zip', + + // diff + // example 'http://tb.plazi.org/GgServer/srsStats/stats?outputFields=doc.uuid+doc.updateDate&groupingFields=doc.uuid+doc.updateDate&orderingFields=doc.updateDate&format=JSON&FP-doc.updateDate=%222020-07-03%22' + diff: '/srsStats/stats?outputFields=doc.uuid+doc.updateDate&groupingFields=doc.uuid+doc.updateDate&orderingFields=doc.updateDate&format=JSON&FP-doc.updateDate=', + + + // single download: '8C2D95A59531F2DCB34D5040E36E6566' + // example 'http://tb.plazi.org/GgServer/xml/8C2D95A59531F2DCB34D5040E36E6566' + single: 'xml' + }, + + treatmentsDump: path.join(cwd, 'data', 'treatmentsDump') + + } -}; +}; \ No newline at end of file diff --git a/dataDictionary/resources/treatments.js b/dataDictionary/resources/treatments.js index b233a7c..50c50a5 100644 --- a/dataDictionary/resources/treatments.js +++ b/dataDictionary/resources/treatments.js @@ -97,6 +97,19 @@ module.exports = { validation : 'Joi.string().description(`${d}`).optional()', resourceId : false }, + + // used to be 'author' + { + plaziName : 'articleAuthor', + zenodoName : '', + sqlType : 
'TEXT', + cheerioElement: '$("document").attr("masterDocTitle")', + description : 'The author(s) of the article (not necessarily the same as the author of the treatment)', + queryable : 'like', + resourceId : false, + queryString : 'author', + validation : 'Joi.string().description(`${d}`).optional()' + }, { plaziName : 'publicationDate', zenodoName : 'publicationDate', @@ -301,19 +314,20 @@ module.exports = { queryString : 'q', validation : 'Joi.string().description(`${d}`).optional()' }, - { - plaziName : 'author', - zenodoName : '', - sqlName : 'treatmentAuthors.treatmentAuthor', - table : 'treatmentAuthors ON treatments.treatmentId = treatmentAuthors.treatmentId', - sqlType : 'TEXT', - cheerioElement: '', - description : 'The author(s) of the article (not necessarily the same as the author of the treatment)', - queryable : 'like', - resourceId : false, - queryString : 'author', - validation : 'Joi.string().description(`${d}`).optional()' - }, + + // { + // plaziName : 'author', + // zenodoName : '', + // sqlName : 'treatmentAuthors.treatmentAuthor', + // table : 'treatmentAuthors ON treatments.treatmentId = treatmentAuthors.treatmentId', + // sqlType : 'TEXT', + // cheerioElement: '', + // description : 'The author(s) of the article (not necessarily the same as the author of the treatment)', + // queryable : 'like', + // resourceId : false, + // queryString : 'author', + // validation : 'Joi.string().description(`${d}`).optional()' + // }, /***************** */