Skip to content

Commit

Permalink
feat(search): Improved ranking for all schemas (#236)
Browse files Browse the repository at this point in the history
* WIP fix exact match

* add tunable bm25 chunks weightage

* test cases

* add title vectors

* format tests

* update dataset timestamps

* remove stopwords for only bm25

* make attendees name optional

* remove comments

* rewrite comments better

* remove commented code

* remove routeTree.gen file

* updated routeTree.gen

* skip search test

* remove test data files

* removing normalize_linear from user schema instead use scale function to normalize score

* fix mail_attachment ranking

* added decay for people

* add constants in schemas

* decrease decay for people

* schemas cleanup

* update stopwords

* decrease doc recency decay

* remove routeTree gen changes

* update stopwords list

* use stopwords from orama

* update package

---------

Co-authored-by: Sahebjot singh <[email protected]>
  • Loading branch information
junaid-shirur and zereraz authored Feb 18, 2025
1 parent 02e6a36 commit 0994602
Show file tree
Hide file tree
Showing 11 changed files with 460 additions and 60 deletions.
2 changes: 1 addition & 1 deletion frontend/src/routeTree.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -324,4 +324,4 @@ export const routeTree = rootRoute
}
}
}
ROUTE_MANIFEST_END */
ROUTE_MANIFEST_END */
4 changes: 3 additions & 1 deletion server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"type": "module",
"version": "0.3.0",
"devDependencies": {
"@faker-js/faker": "^9.3.0",
"@biomejs/biome": "1.9.4",
"@types/bun": "latest",
"@types/uuid": "^10.0.0",
Expand Down Expand Up @@ -31,6 +32,7 @@
"@langchain/community": "^0.3.22",
"@langchain/core": "^0.3.27",
"@notionhq/client": "^2.2.15",
"@orama/stopwords": "^3.0.6",
"@paralleldrive/cuid2": "^2.2.2",
"arctic": "^1.9.2",
"autoevals": "^0.0.119",
Expand All @@ -57,8 +59,8 @@
"pino-pretty": "^13.0.0",
"playwright": "^1.45.3",
"postgres": "^3.4.5",
"uuid": "^11.0.0",
"together-ai": "^0.13.0",
"uuid": "^11.0.0",
"zod": "^3.24.1"
}
}
11 changes: 6 additions & 5 deletions server/search/vespa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import type {
VespaSchema,
VespaMailAttachment,
} from "@/search/types"
import { getErrorMessage } from "@/utils"
import { getErrorMessage, removeStopwords } from "@/utils"
import config from "@/config"
import { getLogger } from "@/logger"
import { Subsystem } from "@/types"
Expand Down Expand Up @@ -423,15 +423,16 @@ export const searchVespa = async (
excludedIds,
notInMailLabels,
)

const hybridDefaultPayload = {
yql,
query,
q: query, // Original user input query
query: removeStopwords(query), // removing stopwords for only bm25, to keep semantic meaning for embeddings
email,
"ranking.profile": profile,
"input.query(e)": "embed(@query)",
"input.query(e)": "embed(@q)",
"input.query(alpha)": alpha,
"input.query(bm25ChunkWeight)": 0.7,
hits: limit,
alpha,
...(offset
? {
offset,
Expand Down
1 change: 1 addition & 0 deletions server/shared/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ export const EventResponseSchema = VespaEventSchema.pick({
type: z.literal(eventSchema),
relevance: z.number(),
description: z.string().optional(),
attendeesNames: z.array(z.string()).optional(),
})
.strip()

Expand Down
263 changes: 263 additions & 0 deletions server/tests/search.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
import { VespaSearchResponseToSearchResult } from "@/search/mappers"
import type {
VespaAutocompleteUser,
VespaEventSearch,
VespaFileSearch,
VespaMailSearch,
VespaSearchResponse,
} from "@/search/types"
import { searchVespa } from "@/search/vespa"
import type { SearchResponse } from "@/shared/types"
import { describe, expect, test, beforeAll, mock, beforeEach } from "bun:test"

// Shape of the per-hit `matchfeatures` object Vespa returns when the rank
// profile exports match features. Quoted keys mirror Vespa's rank-feature
// naming (e.g. "query(alpha)"); the rest are scores computed by the profile.
type MatchFeatures = {
  "query(alpha)": number
  "query(bm25ChunkWeight)": number
  chunk_vector_score: number
  doc_recency: number
  scaled_bm25_chunks: number
  scaled_bm25_title: number
  title_vector_score: number
}

const user = "[email protected]"

/**
 * Runs a Vespa search as the fixture user and maps the raw Vespa response
 * into the shared SearchResponse shape the assertions below work with.
 */
const search = async (query: string): Promise<SearchResponse> => {
  const vespaResponse = await searchVespa(query, user, null, null, 10, 0)
  return VespaSearchResponseToSearchResult(vespaResponse)
}

// Event-schema ranking tests. Skipped by default: they require a Vespa
// instance loaded with the faker-generated test dataset.
describe.skip("search events", () => {
  test("verbatim name search", async () => {
    const query = "Task pull business value."

    const searchResults = await search(query)
    expect(
      searchResults.results[0].type === "event" &&
        searchResults.results[0].name,
    ).toBe(query)
  })

  test("partial name search", async () => {
    // event full name: "Expert win prove brother situation."
    const query = "Expert win prove"

    // FIX: `search()` already resolves to a SearchResponse; its `.results`
    // is a plain array, so the previous `await (await …).results` was a
    // redundant double await on a non-promise.
    const searchResults = (await search(query)).results

    expect(
      searchResults[0].type === "event" &&
        searchResults[0].name.includes(query),
    ).toBeTrue()
  })

  test("search with description", async () => {
    const query =
      "Thus name high. Space whatever little develop student democratic. Heart whom baby decide reality in forget."

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" && searchResults[0].description,
    ).toBe(query)
  })

  test("search with partial description", async () => {
    const query = "Along say want. Yes receive statement bill Republican."

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" &&
        searchResults[0].description?.includes(query),
    ).toBeTrue()
  })

  test("search with partial name and partial description", async () => {
    const query = "Picture fine Probably certain so heart"

    const searchResults = (await search(query)).results
    expect(
      searchResults[0].type === "event" &&
        searchResults[0].name.includes("Picture fine"),
    ).toBeTruthy()
  })

  test("search with attendees names as a phrase", async () => {
    // The event has attendee "Jonathon Phillips"; query mixes the attendee's
    // first name with a fragment of the event name "Their test talk face out."
    const query = "test talk with Jonathon"
    const searchResults = (await search(query)).results

    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "event" && i.name === "Their test talk face out.",
    )

    // The matching event should rank within the top 3 results.
    expect(correctDocIndex).toBeLessThan(3)
    expect(
      searchResults[correctDocIndex].type === "event" &&
        searchResults[correctDocIndex].name,
    ).toBe("Their test talk face out.")
    expect(
      searchResults[correctDocIndex].type === "event" &&
        searchResults[correctDocIndex].attendeesNames!.includes(
          "Jonathon Phillips",
        ),
    ).toBeTruthy()
  })
})

// Mail-schema ranking tests. Skipped by default: they need a Vespa instance
// populated with the generated test dataset.
describe.skip("search mails", () => {
  test("exact subject match", async () => {
    const query = "Citizen while suddenly phone recently analysis."

    const { results } = await search(query)
    expect(results[0].type === "mail" && results[0].subject).toBe(query)
  })

  test("partial subject match", async () => {
    const query = "Prevent force difference kid"

    const { results } = await search(query)
    const topHitMatches =
      results[0].type === "mail" && results[0].subject.includes(query)
    expect(topHitMatches).toBe(true)
  })

  // TODO: fix chunks search
  test("mail chunks search", async () => {
    // The target mail's subject is "Fill far main energy industry however simply form. "
    const query =
      "McCloud is an American television police drama that aired on NBC from 1970-77"

    const { results } = await search(query)
    const rankOfTarget = results.findIndex(
      (hit) =>
        hit.type === "mail" &&
        hit.subject === "Fill far main energy industry however simply form.",
    )

    expect(rankOfTarget).toBeLessThan(5)
  })
})

// File-schema ranking tests (title, chunk, semantic and recency signals).
// Skipped by default: they require a Vespa instance with the test dataset.
describe.skip("search files", () => {
  test("verbatim title search", async () => {
    const query = "From ACH direct debit to Prepaid card?"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "file" && searchResults[0].title).toBe(
      query,
    )
  })

  test("partial title match", async () => {
    const query = "Who maintains receipt"
    const searchResults = (await search(query)).results

    expect(
      searchResults[0].type === "file" &&
        searchResults[0].title.includes(query),
    ).toBe(true)
  })

  test("fuzzy search", async () => {
    // title of the doc searched for
    const docTitleSearchedFor =
      "What tax-free retirement accounts are available for self-employed individuals?"
    const query = "tax-free retirement self employed"
    const searchResults = (await search(query)).results
    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "file" && i.title === docTitleSearchedFor,
    )
    expect(correctDocIndex).toBeLessThanOrEqual(3)
  })

  test("out-of-order search", async () => {
    // title of the doc searched for
    const docTitleSearchedFor =
      "Are Investment Research websites worth their premiums?"
    const query = "investment premiums worth"
    const searchResults = (await search(query)).results
    const correctDocIndex = searchResults.findIndex(
      (i) => i.type === "file" && i.title === docTitleSearchedFor,
    )

    expect(correctDocIndex).toBeLessThan(3)
  })

  test("chunks match", async () => {
    // title of the doc whose chunk should match
    const chunkDocTitle =
      "What standards should I expect of my CPA when an error was made?"
    const query =
      "I haven't spoken to Kwame since he went off to HBS, but I did get an invitation to his graduation"
    //@ts-ignore
    const searchResults: {
      fields: VespaFileSearch & { matchfeatures: MatchFeatures }
    }[] = (await searchVespa(query, user, null, null, 10, 0)).root.children
    const correctDocIndex = searchResults.findIndex(
      (i) => i.fields?.title === chunkDocTitle,
    )

    expect(
      searchResults[correctDocIndex].fields.matchfeatures.scaled_bm25_chunks >
        0.9,
    ).toBeTrue()
    expect(correctDocIndex).toBeLessThan(3)
  })

  test("semantic search", async () => {
    const query = "what are north korea note worthy things"
    //@ts-ignore
    const searchResults: {
      fields: VespaFileSearch & { matchfeatures: MatchFeatures }
    }[] = (await searchVespa(query, user, null, null, 10, 0)).root.children
    const fileIdx = searchResults.findIndex(
      (i) => i.fields.sddocname === "file",
    )

    expect(fileIdx).toBeLessThan(3)
    expect(
      searchResults[fileIdx].fields.matchfeatures.chunk_vector_score > 0.5,
    ).toBeTrue()
  })

  test("recent document should have higher rank", async () => {
    const query = "claim mileage for traveling"
    const searchResults = (await search(query)).results

    // FIX: replaced loose `==` plus the precedence-fragile
    // `a == "file" && a.updatedAt || 0` with explicit ternaries — same
    // result (0 when not a file / no timestamp), but unambiguous.
    const first = searchResults[0]
    const second = searchResults[1]
    const doc1 = first.type === "file" ? first.updatedAt || 0 : 0
    const doc2 = second.type === "file" ? second.updatedAt || 0 : 0

    expect(doc1 > doc2).toBeTrue()
  })
})

// User-schema ("people") ranking tests. Skipped by default: they require a
// Vespa instance loaded with the generated test dataset.
describe.skip("people search", () => {
  test("name match", async () => {
    const query = "Brenda Molina"
    // FIX: `search()` resolves to a SearchResponse; `.results` is a plain
    // array, so the previous `await (await …).results` double await was
    // redundant.
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].name).toBe(
      query,
    )
  })

  test("email match", async () => {
    const query = "[email protected]"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].email).toBe(
      query,
    )
  })

  test("retrieves user document by name-based query", async () => {
    const query = "get the contact of Kim Calhoun"
    const searchResults = (await search(query)).results

    expect(searchResults[0].type === "user" && searchResults[0].name).toBe(
      "Kim Calhoun",
    )
  })
})
12 changes: 12 additions & 0 deletions server/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import type { CookieOptions } from "hono/utils/cookie"
import fs from "node:fs/promises"
import { getLogger } from "@/logger"
import { Subsystem } from "@/types"
import { stopwords as englishStopwords } from "@orama/stopwords/english";

const Logger = getLogger(Subsystem.Utils)

Expand Down Expand Up @@ -166,3 +167,14 @@ export const splitGroupedCitationsWithSpaces = (text: string): string => {
},
)
}

/**
 * Strips English stopwords from a query string.
 *
 * Used to build the BM25-only query text: stopwords add lexical noise, while
 * the original query (stopwords included) is still sent for embeddings so
 * the semantic signal keeps its full meaning.
 *
 * @param text - whitespace-separated query text
 * @returns the remaining tokens joined with single spaces
 */
export const removeStopwords = (text: string): string => {
  // FIX: build a Set once so each membership check is O(1) instead of the
  // previous O(stopwords) Array.prototype.includes per token.
  const stopwordSet = new Set(englishStopwords)

  return text
    .split(/\s+/)
    .filter((word) => {
      // Normalize to a lowercased, alphanumeric-only form so punctuation
      // stuck to a token ("The," / "the.") still matches the stopword list.
      const cleanWord = word.toLowerCase().replace(/[^\w]/g, "")
      return !stopwordSet.has(cleanWord)
    })
    .join(" ")
}
Loading

0 comments on commit 0994602

Please sign in to comment.