Add wordforms/search_suggest

johnjcamilleri · johnjcamilleri · commit 5fb002f9618a · 2020-07-04T20:56:25.000+02:00
Results of lexemes/search_suggest are wrapped in 'lexeme' for consistency.
diff --git a/README.md b/README.md
@@ -39,6 +39,16 @@ Run all tests with `npm test`.
 Run an individual testsuite with `npx mocha --exit test/schema.js` or use the `--grep` flag.
 To stop on first failure, use `--bail`
 
+### Using test data
+
+1. Set DB URL in `server-config.js` to `...gabra-test` (or something else)
+2. ```
+   node scripts/node/populate.js test/data/*.json
+   node scripts/node/resolve-lexeme-ids.js
+   node scripts/node/create-indexes.js
+   (cd scripts/node && ./run.js update-glosses-collection.js)
+   ```
+
 ## Repository
 
 - `master` branch is used for development.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "gabra-api",
-  "version": "2.12.0",
+  "version": "2.13.0",
   "description": "Ġabra: an open lexicon for Maltese",
   "author": "John J. Camilleri <john@johnjcamilleri.com> (http://johnjcamilleri.com/)",
   "license": "MIT",
diff --git a/public/markdown/api.md b/public/markdown/api.md
@@ -96,12 +96,14 @@ The results are sorted by part of speech and derived form, and will not include
 |:-----------------|:------------|:---------------------------|
 | `:id` (required) | Lexeme ID   | `5200a366e36f237975000f26` |
 
-### Search suggest
+### Search suggest <small>Changed in v2.13</small>
 
-List variations in spelling (diacritics, character case) of a search term, from lemmas:
+Find lemmas or wordforms that match the search term apart from differences in spelling (diacritics, character case):
 
 > [/lexemes/search_suggest?s=Hareg](#{baseURL}/lexemes/search_suggest?s=Hareg)
 
+> [/wordforms/search_suggest?s=ohorgu](#{baseURL}/wordforms/search_suggest?s=ohorgu)
+
 | Argument       | Description  | Example |
 |:---------------|:-------------|:--------|
 | `s` (required) | Search query | `Hareg` |
diff --git a/routes/lexemes.js b/routes/lexemes.js
@@ -178,24 +178,24 @@ router.get('/search', function (req, res) {
         collection.find(conds_l, opts),
         collection.count(conds_l)
       ])
-    .then(values => {
-      var docs = values[0]
-      var count = values[1]
-      queryObj.result_count = count
-      res.json({
-        'results': docs.map(doc => {
-          return {
-            'lexeme': doc
-          }
-        }),
-        'query': queryObj
-      })
-    })
-    .catch(err => {
-      console.error(err)
-      res.status(500).end()
+        .then(values => {
+          var docs = values[0]
+          var count = values[1]
+          queryObj.result_count = count
+          res.json({
+            'results': docs.map(doc => {
+              return {
+                'lexeme': doc
+              }
+            }),
+            'query': queryObj
+          })
+        })
+        .catch(err => {
+          console.error(err)
+          res.status(500).end()
+        })
     })
-  })
 })
 
 /*
@@ -312,17 +312,16 @@ router.get('/search_suggest', function (req, res) {
   // s = s.replace(/^([^\[])/, function (m,c,o,s) { return '[' + c.toUpperCase() + ']'})
 
   // Handle diacritics
-  s = s.replace(/^\^/, '')
-  s = s.replace(/\$$/, '')
   s = s.replace(/c/g, 'ċ')
   s = s.replace(/g/g, '[gġ]')
   s = s.replace(/h/g, '[hħ]')
   s = s.replace(/z/g, '[zż]')
 
   // No substrings
+  s = s.replace(/^\^/, '')
+  s = s.replace(/\$$/, '')
   s = '^' + s + '$'
 
-  var collection = db.get('lexemes')
   var query = {
     '$or': [
       {
@@ -334,23 +333,24 @@ router.get('/search_suggest', function (req, res) {
     ],
     'pending': {'$ne': true}
   }
+
   var opts = {
     'projection': {'lemma': true}
   }
-  collection.find(query, opts, function (err, docs) {
-    if (err) {
+  db.get('lexemes').find(query, opts)
+    .catch(function (err) {
       console.error(err)
       res.status(500).end()
-      return
-    }
-    res.json({
-      'results': docs,
-      'query': {
-        'term': orig,
-        'result_count': docs.length
-      }
     })
-  })
+    .then(function (data) {
+      res.json({
+        'results': data.map((l) => { return {'lexeme': l} }),
+        'query': {
+          'term': orig,
+          'result_count': data.length
+        }
+      })
+    })
 })
 
 /*
diff --git a/routes/wordforms.js b/routes/wordforms.js
@@ -2,6 +2,7 @@ var express = require('express')
 var router = express.Router()
 var passport = require('passport')
 var async = require('async')
+var regexquote = require('regexp-quote')
 var monk = require('monk')
 
 var log = require('./helpers/logger').makeLogger('wordforms')
@@ -154,6 +155,57 @@ router.post('/replace/:lexeme_id',
     })
   })
 
+/*
+ * GET search suggest
+ */
+router.get('/search_suggest', function (req, res) {
+  var db = req.db
+
+  var orig = req.query.s
+  var s = regexquote(orig)
+
+  // Handle capitalisation
+  s = s.toLowerCase()
+  // s = s.replace(/^\[(.+?)\]/, function (m,c,o,s) { return '[' + c.toLowerCase() + c.toUpperCase() + ']'})
+  // s = s.replace(/^([^\[])/, function (m,c,o,s) { return '[' + c.toUpperCase() + ']'})
+
+  // Handle diacritics
+  s = s.replace(/c/g, 'ċ')
+  s = s.replace(/g/g, '[gġ]')
+  s = s.replace(/h/g, '[hħ]')
+  s = s.replace(/z/g, '[zż]')
+
+  // No substrings
+  s = s.replace(/^\^/, '')
+  s = s.replace(/\$$/, '')
+  s = '^' + s + '$'
+
+  var query = {
+    'surface_form': {'$regex': s, '$ne': orig},
+    'pending': {'$ne': true}
+  }
+  var opts = {
+    'projection': {
+      'surface_form': true,
+      'lexeme_id': true
+    }
+  }
+  db.get('wordforms').find(query, opts)
+    .catch(function (err) {
+      console.error(err)
+      res.status(500).end()
+    })
+    .then(function (data) {
+      res.json({
+        'results': data.map((l) => { return {'wordform': l} }),
+        'query': {
+          'term': orig,
+          'result_count': data.length
+        }
+      })
+    })
+})
+
 /*
  * GET count
  */
diff --git a/test/data/wordforms.json b/test/data/wordforms.json
@@ -123,5 +123,79 @@
         "Camilleri2013"
     ],
     "pending" : true
+},
+{
+  "aspect": "perf",
+  "dir_obj": null,
+  "generated": true,
+  "ind_obj": null,
+  "lexeme": {
+    "lemma": "ħareġ",
+    "pos": "VERB"
+  },
+  "phonetic": "hrɪʧt",
+  "polarity": "pos",
+  "sources": ["Camilleri2013"],
+  "subject": {
+    "person": "p1",
+    "number": "sg"
+  },
+  "surface_form": "ħriġt"
+},
+{
+  "aspect": "perf",
+  "dir_obj": null,
+  "generated": true,
+  "ind_obj": null,
+  "lexeme": {
+    "lemma": "ħareġ",
+    "pos": "VERB"
+  },
+  "phonetic": "hrɪʧt",
+  "polarity": "pos",
+  "sources": ["Camilleri2013"],
+  "subject": {
+    "person": "p2",
+    "number": "sg"
+  },
+  "surface_form": "ħriġt"
+},
+{
+  "aspect": "perf",
+  "dir_obj": null,
+  "generated": true,
+  "ind_obj": null,
+  "lexeme": {
+    "lemma": "ħareġ",
+    "pos": "VERB"
+  },
+  "phonetic": "hɐrɛʧ",
+  "polarity": "pos",
+  "sources": ["Camilleri2013"],
+  "subject": {
+    "person": "p3",
+    "number": "sg",
+    "gender": "m"
+  },
+  "surface_form": "ħareġ"
+},
+{
+  "aspect": "perf",
+  "dir_obj": null,
+  "generated": true,
+  "ind_obj": null,
+  "lexeme": {
+    "lemma": "ħareġ",
+    "pos": "VERB"
+  },
+  "phonetic": "hɐrʤɛt",
+  "polarity": "pos",
+  "sources": ["Camilleri2013"],
+  "subject": {
+    "person": "p3",
+    "number": "sg",
+    "gender": "f"
+  },
+  "surface_form": "ħarġet"
 }
 ]
diff --git a/test/search.js b/test/search.js
@@ -22,13 +22,23 @@ describe('Search', function () {
         res.body.query.result_count.should.be.greaterThanOrEqual(opts.result_count)
       }
 
-      // Results should contain these lemmas (in any order)
+      // Lexeme results should contain these lemmas (in any order)
       if (opts.lemmas) {
         for (let i in opts.lemmas) {
           let lemma = opts.lemmas[i]
           res.body.results.should.matchAny(function (value) {
             value.lexeme.lemma.should.equal(lemma)
-          }, 'lemma "' + lemma + '" not found in results')
+          }, `lemma "${lemma}" not found in results`)
+        }
+      }
+
+      // Wordform results should contain these surface forms (in any order)
+      if (opts.surface_forms) {
+        for (let i in opts.surface_forms) {
+          let sf = opts.surface_forms[i]
+          res.body.results.should.matchAny(function (value) {
+            value.wordform.surface_form.should.equal(sf)
+          }, `surface form "${sf}" not found in results`)
         }
       }
 
@@ -93,6 +103,24 @@ describe('Search', function () {
 
   // -------------------------------------------------------------------------
 
+  describe('Search suggest', function () { // covers both search_suggest endpoints
+    it('suggest lexeme', function (done) {
+      request(server)
+        .get('/lexemes/search_suggest?s=Hareg') // capitalised query without diacritics
+        .expect(200)
+        .end(checkResponse({lemmas: ['ħareġ']}, done)) // lemma is read from each result's `lexeme` wrapper
+    })
+
+    it('suggest wordform', function (done) {
+      request(server)
+        .get('/wordforms/search_suggest?s=harget') // query without diacritics
+        .expect(200)
+        .end(checkResponse({surface_forms: ['ħarġet']}, done)) // surface form is read from each result's `wordform` wrapper
+    })
+  })
+
+  // -------------------------------------------------------------------------
+
   describe('Load stuff', function () {
     var lexeme_id
 

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "gabra-api",`
`3`		`- "version": "2.12.0",`
	`3`	`+ "version": "2.13.0",`
`4`	`4`	`"description": "Ġabra: an open lexicon for Maltese",`
`5`	`5`	`"author": "John J. Camilleri <john@johnjcamilleri.com> (http://johnjcamilleri.com/)",`
`6`	`6`	`"license": "MIT",`