forked from johnfactotum/foliate-js
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.js
114 lines (108 loc) · 4.62 KB
/
search.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// length for context in excerpts
const CONTEXT_LENGTH = 50
const normalizeWhitespace = str => str.replace(/\s+/g, ' ')
const makeExcerpt = (strs, { startIndex, startOffset, endIndex, endOffset }) => {
const start = strs[startIndex]
const end = strs[endIndex]
const match = start === end
? start.slice(startOffset, endOffset)
: start.slice(startOffset)
+ strs.slice(start + 1, end).join('')
+ end.slice(0, endOffset)
const trimmedStart = normalizeWhitespace(start.slice(0, startOffset)).trimStart()
const trimmedEnd = normalizeWhitespace(end.slice(endOffset)).trimEnd()
const ellipsisPre = trimmedStart.length < CONTEXT_LENGTH ? '' : '…'
const ellipsisPost = trimmedEnd.length < CONTEXT_LENGTH ? '' : '…'
const pre = `${ellipsisPre}${trimmedStart.slice(-CONTEXT_LENGTH)}`
const post = `${trimmedEnd.slice(0, CONTEXT_LENGTH)}${ellipsisPost}`
return { pre, match, post }
}
// TODO: maybe use this for exact matches as it would be faster
/*
export const simpleSearch = function* (strs, query, locales = 'en') {
const haystack = strs.join('')
const lowerHaystack = haystack.toLocaleLowerCase(locales)
const needle = query.toLocaleLowerCase(locales)
const needleLength = needle.length
let index = -1
do {
index = lowerHaystack.indexOf(needle, index + 1)
if (index > -1) {
const end = index + needleLength
// TODO
}
} while (index > -1)
}
*/
const segmenterSearch = function* (strs, query, options = {}) {
const { locales = 'en', granularity = 'word', sensitivity = 'base' } = options
let segmenter, collator
try {
segmenter = new Intl.Segmenter(locales, { usage: 'search', granularity })
collator = new Intl.Collator(locales, { sensitivity })
} catch (e) {
console.warn(e)
segmenter = new Intl.Segmenter('en', { usage: 'search', granularity })
collator = new Intl.Collator('en', { sensitivity })
}
const queryLength = Array.from(segmenter.segment(query)).length
const substrArr = []
let strIndex = 0
let segments = segmenter.segment(strs[strIndex])[Symbol.iterator]()
main: while (strIndex < strs.length) {
while (substrArr.length < queryLength) {
const { done, value } = segments.next()
if (done) {
// the current string is exhausted
// move on to the next string
strIndex++
if (strIndex < strs.length) {
segments = segmenter.segment(strs[strIndex])[Symbol.iterator]()
continue
} else break main
}
const { index, segment } = value
// ignore formatting characters
if (!/[^\p{Format}]/u.test(segment)) continue
// normalize whitespace
if (/\s/u.test(segment)) {
if (!/\s/u.test(substrArr[substrArr.length - 1]?.segment))
substrArr.push({ strIndex, index, segment: ' ' })
continue
}
value.strIndex = strIndex
substrArr.push(value)
}
const substr = substrArr.map(x => x.segment).join('')
if (collator.compare(query, substr) === 0) {
const endIndex = strIndex
const lastSeg = substrArr[substrArr.length - 1]
const endOffset = lastSeg.index + lastSeg.segment.length
const startIndex = substrArr[0].strIndex
const startOffset = substrArr[0].index
const range = { startIndex, startOffset, endIndex, endOffset }
yield { range, excerpt: makeExcerpt(strs, range) }
}
substrArr.shift()
}
}
export const searchMatcher = (textWalker, opts) => {
const { defalutLocale, matchCase, matchDiacritics, matchWholeWords } = opts
return function* (doc, query) {
const iter = textWalker(doc, function* (strs, makeRange) {
for (const result of segmenterSearch(strs, query, {
locales: doc.body.lang || doc.documentElement.lang || defalutLocale || 'en',
granularity: matchWholeWords ? 'word' : 'grapheme',
sensitivity: matchDiacritics && matchCase ? 'variant'
: matchDiacritics && !matchCase ? 'accent'
: !matchDiacritics && matchCase ? 'case'
: 'base',
})) {
const { startIndex, startOffset, endIndex, endOffset } = result.range
result.range = makeRange(startIndex, startOffset, endIndex, endOffset)
yield result
}
})
for (const result of iter) yield result
}
}