Skip to content

Commit

Permalink
feat(search): Improved ranking for all schemas (#236)
Browse files Browse the repository at this point in the history
* WIP fix exact match

* add tunable bm25 chunks weightage

* test cases

* add title vectors

* format tests

* update dataset timestamps

* remove stopwords for only bm25

* make attendees name optional

* remove comments

* rewrite comments better

* remove commented code

* remove routeTree.gen file

* updated routeTree.gen

* skip search test

* remove test data files

* removing normalize_linear from user schema instead use scale function to normalize score

* fix mail_attachment ranking

* added decay for people

* add constants in schemas

* decrease decay for people

* schemas cleanup

* update stopwords

* decrease doc recency decay

* remove routeTree gen changes

* update stopwords list

* use stopwords from orama

* update package

---------

Co-authored-by: Sahebjot singh <[email protected]>
  • Loading branch information
junaid-shirur and zereraz authored Feb 18, 2025
1 parent 02e6a36 commit 0994602
Show file tree
Hide file tree
Showing 11 changed files with 460 additions and 60 deletions.
2 changes: 1 addition & 1 deletion frontend/src/routeTree.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -324,4 +324,4 @@ export const routeTree = rootRoute
}
}
}
ROUTE_MANIFEST_END */
ROUTE_MANIFEST_END */
4 changes: 3 additions & 1 deletion server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"type": "module",
"version": "0.3.0",
"devDependencies": {
"@faker-js/faker": "^9.3.0",
"@biomejs/biome": "1.9.4",
"@types/bun": "latest",
"@types/uuid": "^10.0.0",
Expand Down Expand Up @@ -31,6 +32,7 @@
"@langchain/community": "^0.3.22",
"@langchain/core": "^0.3.27",
"@notionhq/client": "^2.2.15",
"@orama/stopwords": "^3.0.6",
"@paralleldrive/cuid2": "^2.2.2",
"arctic": "^1.9.2",
"autoevals": "^0.0.119",
Expand All @@ -57,8 +59,8 @@
"pino-pretty": "^13.0.0",
"playwright": "^1.45.3",
"postgres": "^3.4.5",
"uuid": "^11.0.0",
"together-ai": "^0.13.0",
"uuid": "^11.0.0",
"zod": "^3.24.1"
}
}
11 changes: 6 additions & 5 deletions server/search/vespa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import type {
VespaSchema,
VespaMailAttachment,
} from "@/search/types"
import { getErrorMessage } from "@/utils"
import { getErrorMessage, removeStopwords } from "@/utils"
import config from "@/config"
import { getLogger } from "@/logger"
import { Subsystem } from "@/types"
Expand Down Expand Up @@ -423,15 +423,16 @@ export const searchVespa = async (
excludedIds,
notInMailLabels,
)

const hybridDefaultPayload = {
yql,
query,
q: query, // Original user input query
query: removeStopwords(query), // removing stopwords for only bm25, to keep semantic meaning for embeddings
email,
"ranking.profile": profile,
"input.query(e)": "embed(@query)",
"input.query(e)": "embed(@q)",
"input.query(alpha)": alpha,
"input.query(bm25ChunkWeight)": 0.7,
hits: limit,
alpha,
...(offset
? {
offset,
Expand Down
1 change: 1 addition & 0 deletions server/shared/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ export const EventResponseSchema = VespaEventSchema.pick({
type: z.literal(eventSchema),
relevance: z.number(),
description: z.string().optional(),
attendeesNames: z.array(z.string()).optional(),
})
.strip()

Expand Down
263 changes: 263 additions & 0 deletions server/tests/search.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
import { VespaSearchResponseToSearchResult } from "@/search/mappers"
import type {
VespaAutocompleteUser,
VespaEventSearch,
VespaFileSearch,
VespaMailSearch,
VespaSearchResponse,
} from "@/search/types"
import { searchVespa } from "@/search/vespa"
import type { SearchResponse } from "@/shared/types"
import { describe, expect, test, beforeAll, mock, beforeEach } from "bun:test"

// Shape of the per-hit `matchfeatures` object Vespa returns when the rank
// profile exports match features. Quoted keys mirror Vespa's rank-feature
// naming (e.g. "query(alpha)"); the rest are scores computed by the profile.
type MatchFeatures = {
  "query(alpha)": number
  "query(bm25ChunkWeight)": number
  chunk_vector_score: number
  doc_recency: number
  scaled_bm25_chunks: number
  scaled_bm25_title: number
  title_vector_score: number
}

const user = "[email protected]"

/**
 * Runs a Vespa search as the fixture user and maps the raw Vespa response
 * into the shared SearchResponse shape the assertions below work with.
 */
const search = async (query: string): Promise<SearchResponse> => {
  const vespaResponse = await searchVespa(query, user, null, null, 10, 0)
  return VespaSearchResponseToSearchResult(vespaResponse)
}

// Event-schema ranking tests. Skipped by default: they require a Vespa
// instance loaded with the faker-generated test dataset.
describe.skip("search events", () => {
  test("verbatim name search", async () => {
    const query = "Task pull business value."

    const searchResults = await search(query)
    expect(
      searchResults.results[0].type === "event" &&
        searchResults.results[0].name,
    ).toBe(query)
  })

  test("partial name search", async () => {
    // event full name: "Expert win prove brother situation."
    const query = "Expert win prove"

    // FIX: `search()` already resolves to a SearchResponse; its `.results`
    // is a plain array, so the previous `await (await …).results` was a
    // redundant double await on a non-promise.
    const searchResults = (await search(query)).results

    expect(
      searchResults[0].type === "event" &&
        searchResults[0].name.includes(query),
    ).toBeTrue()
  })

  test("search with description", async () => {
    const query =
      "Thus name high. Space whatever little develop student democratic. Heart whom baby decide reality in forget."

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" && searchResults[0].description,
    ).toBe(query)
  })

  test("search with partial description", async () => {
    const query = "Along say want. Yes receive statement bill Republican."

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" &&
        searchResults[0].description?.includes(query),
    ).toBeTrue()
  })

  test("search with partial name and partial description", async () => {
    const query = "Picture fine Probably certain so heart"

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" &&
        searchResults[0].name.includes("Picture fine"),
    ).toBeTruthy()
  })

  test("search with attendees names as a phrase", async () => {
    // The event has attendee "Jonathon Phillips"; query mixes the attendee's
    // first name with a fragment of the event name "Their test talk face out."
    const query = "test talk with Jonathon"
    const searchResults = (await search(query)).results

    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "event" && i.name === "Their test talk face out.",
    )

    // The matching event should rank within the top 3 results.
    expect(correctDocIndex).toBeLessThan(3)
    expect(
      searchResults[correctDocIndex].type === "event" &&
        searchResults[correctDocIndex].name,
    ).toBe("Their test talk face out.")
    expect(
      searchResults[correctDocIndex].type === "event" &&
        searchResults[correctDocIndex].attendeesNames!.includes(
          "Jonathon Phillips",
        ),
    ).toBeTruthy()
  })
})

// Mail-schema ranking tests. Skipped by default: they need a Vespa instance
// populated with the generated test dataset.
describe.skip("search mails", () => {
  test("exact subject match", async () => {
    const query = "Citizen while suddenly phone recently analysis."

    const { results } = await search(query)
    expect(results[0].type === "mail" && results[0].subject).toBe(query)
  })

  test("partial subject match", async () => {
    const query = "Prevent force difference kid"

    const { results } = await search(query)
    const topHitMatches =
      results[0].type === "mail" && results[0].subject.includes(query)
    expect(topHitMatches).toBe(true)
  })

  // TODO: fix chunks search
  test("mail chunks search", async () => {
    // The target mail's subject is "Fill far main energy industry however simply form. "
    const query =
      "McCloud is an American television police drama that aired on NBC from 1970-77"

    const { results } = await search(query)
    const rankOfTarget = results.findIndex(
      (hit) =>
        hit.type === "mail" &&
        hit.subject === "Fill far main energy industry however simply form.",
    )

    expect(rankOfTarget).toBeLessThan(5)
  })
})

// File-schema ranking tests (title, chunk, semantic and recency signals).
// Skipped by default: they require a Vespa instance with the test dataset.
describe.skip("search files", () => {
  test("verbatim title search", async () => {
    const query = "From ACH direct debit to Prepaid card?"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "file" && searchResults[0].title).toBe(
      query,
    )
  })

  test("partial title match", async () => {
    const query = "Who maintains receipt"
    const searchResults = (await search(query)).results

    expect(
      searchResults[0].type === "file" &&
        searchResults[0].title.includes(query),
    ).toBe(true)
  })

  test("fuzzy search", async () => {
    // title of the doc searched for
    const docTitleSearchedFor =
      "What tax-free retirement accounts are available for self-employed individuals?"
    const query = "tax-free retirement self employed"
    const searchResults = (await search(query)).results
    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "file" && i.title === docTitleSearchedFor,
    )
    expect(correctDocIndex).toBeLessThanOrEqual(3)
  })

  test("out-of-order search", async () => {
    // title of the doc searched for
    const docTitleSearchedFor =
      "Are Investment Research websites worth their premiums?"
    const query = "investment premiums worth"
    const searchResults = (await search(query)).results
    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "file" && i.title === docTitleSearchedFor,
    )

    expect(correctDocIndex).toBeLessThan(3)
  })

  test("chunks match", async () => {
    // title of the doc whose chunk should match
    const chunkDocTitle =
      "What standards should I expect of my CPA when an error was made?"
    const query =
      "I haven't spoken to Kwame since he went off to HBS, but I did get an invitation to his graduation"
    //@ts-ignore
    const searchResults: {
      fields: VespaFileSearch & { matchfeatures: MatchFeatures }
    }[] = (await searchVespa(query, user, null, null, 10, 0)).root.children
    const correctDocIndex = searchResults.findIndex(
      (i) => i.fields?.title === chunkDocTitle,
    )

    expect(
      searchResults[correctDocIndex].fields.matchfeatures.scaled_bm25_chunks >
        0.9,
    ).toBeTrue()
    expect(correctDocIndex).toBeLessThan(3)
  })

  test("semantic search", async () => {
    const query = "what are north korea note worthy things"
    //@ts-ignore
    const searchResults: {
      fields: VespaFileSearch & { matchfeatures: MatchFeatures }
    }[] = (await searchVespa(query, user, null, null, 10, 0)).root.children
    const fileIdx = searchResults.findIndex(
      (i) => i.fields.sddocname === "file",
    )

    expect(fileIdx).toBeLessThan(3)
    expect(
      searchResults[fileIdx].fields.matchfeatures.chunk_vector_score > 0.5,
    ).toBeTrue()
  })

  test("recent document should have higher rank", async () => {
    const query = "claim mileage for traveling"
    const searchResults = (await search(query)).results

    // FIX: replaced loose `==` plus the precedence-fragile
    // `a == "file" && a.updatedAt || 0` with explicit ternaries — same
    // result (0 when not a file / no timestamp), but unambiguous.
    const first = searchResults[0]
    const second = searchResults[1]
    const doc1 = first.type === "file" ? first.updatedAt || 0 : 0
    const doc2 = second.type === "file" ? second.updatedAt || 0 : 0

    expect(doc1 > doc2).toBeTrue()
  })
})

// User-schema ("people") ranking tests. Skipped by default: they require a
// Vespa instance loaded with the generated test dataset.
describe.skip("people search", () => {
  test("name match", async () => {
    const query = "Brenda Molina"
    // FIX: `search()` resolves to a SearchResponse; `.results` is a plain
    // array, so the previous `await (await …).results` double await was
    // redundant.
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].name).toBe(
      query,
    )
  })

  test("email match", async () => {
    const query = "[email protected]"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].email).toBe(
      query,
    )
  })

  test("retrieves user document by name-based query", async () => {
    const query = "get the contact of Kim Calhoun"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].name).toBe(
      "Kim Calhoun",
    )
  })
})
12 changes: 12 additions & 0 deletions server/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import type { CookieOptions } from "hono/utils/cookie"
import fs from "node:fs/promises"
import { getLogger } from "@/logger"
import { Subsystem } from "@/types"
import { stopwords as englishStopwords } from "@orama/stopwords/english";

const Logger = getLogger(Subsystem.Utils)

Expand Down Expand Up @@ -166,3 +167,14 @@ export const splitGroupedCitationsWithSpaces = (text: string): string => {
},
)
}

/**
 * Strips English stopwords from a query string.
 *
 * Used to build the BM25-only query text: stopwords add lexical noise, while
 * the original query (stopwords included) is still sent for embeddings so
 * the semantic signal keeps its full meaning.
 *
 * @param text - whitespace-separated query text
 * @returns the remaining tokens joined with single spaces
 */
export const removeStopwords = (text: string): string => {
  // FIX: build a Set once so each membership check is O(1) instead of the
  // previous O(stopwords) Array.prototype.includes per token.
  const stopwordSet = new Set(englishStopwords)

  return text
    .split(/\s+/)
    .filter((word) => {
      // Normalize to a lowercased, alphanumeric-only form so punctuation
      // stuck to a token ("The," / "the.") still matches the stopword list.
      const cleanWord = word.toLowerCase().replace(/[^\w]/g, "")
      return !stopwordSet.has(cleanWord)
    })
    .join(" ")
}
Loading

0 comments on commit 0994602

Please sign in to comment.