-
Notifications
You must be signed in to change notification settings - Fork 927
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: playwright, spatial parsing, markdown for web search
Co-authored-by: Aaditya Sahay <[email protected]>
- Loading branch information
1 parent
50febad
commit 8c3db9a
Showing
33 changed files
with
1,719 additions
and
449 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,48 @@ | ||
import { Address6, Address4 } from "ip-address"; | ||
|
||
import dns from "node:dns"; | ||
|
||
export async function isURLLocal(URL: URL): Promise<boolean> { | ||
const isLocal = new Promise<boolean>((resolve, reject) => { | ||
dns.lookup(URL.hostname, (err, address, family) => { | ||
if (err) { | ||
reject(err); | ||
} | ||
if (family === 4) { | ||
const addr = new Address4(address); | ||
resolve(addr.isInSubnet(new Address4("127.0.0.0/8"))); | ||
} else if (family === 6) { | ||
const addr = new Address6(address); | ||
resolve( | ||
addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal() | ||
); | ||
} else { | ||
reject(new Error("Unknown IP family")); | ||
} | ||
/**
 * Resolves a hostname to a single IP address.
 *
 * Uses Node's built-in promise API (`dns.promises.lookup`) rather than
 * hand-wrapping the callback form of `dns.lookup` in a `new Promise`.
 *
 * @param hostname - Host name (or IP literal) to look up.
 * @returns The resolved address and its IP family number.
 * @throws Rejects with the lookup error when resolution fails.
 */
const dnsLookup = async (hostname: string): Promise<{ address: string; family: number }> => {
  const { address, family } = await dns.promises.lookup(hostname);
  // Re-pack so callers get exactly the two documented fields.
  return { address, family };
};
|
||
/**
 * Resolves the URL's hostname and reports whether the resulting address is local.
 *
 * - IPv4: the address lies inside `127.0.0.0/8`.
 * - IPv6: the address is loopback, inside `::1/128`, or link-local.
 *
 * Note: the parameter is named `URL` and therefore shadows the global `URL`
 * constructor inside this function.
 *
 * @param URL - Parsed URL whose `hostname` is looked up.
 * @returns `true` when the resolved address matches one of the ranges above.
 * @throws Error("Unknown IP family") when the lookup reports a family other
 *   than 4 or 6; lookup failures from `dnsLookup` propagate unchanged.
 */
export async function isURLLocal(URL: URL): Promise<boolean> {
  const resolved = await dnsLookup(URL.hostname);

  switch (resolved.family) {
    case 4: {
      const ipv4 = new Address4(resolved.address);
      const loopbackRange = new Address4("127.0.0.0/8");
      return ipv4.isInSubnet(loopbackRange);
    }
    case 6: {
      const ipv6 = new Address6(resolved.address);
      if (ipv6.isLoopback()) return true;
      if (ipv6.isInSubnet(new Address6("::1/128"))) return true;
      return ipv6.isLinkLocal();
    }
    default:
      throw Error("Unknown IP family");
  }
}
|
||
/**
 * String-input variant of `isURLLocal`.
 *
 * Always returns a promise: previously an unparsable URL produced a bare
 * `true` while a parsable one produced a `Promise<boolean>`, so the result
 * type depended on the input. Awaiting callers are unaffected.
 *
 * Only URL parsing is guarded. Errors raised while resolving the hostname
 * still surface as a rejected promise, exactly as before.
 *
 * @param url - Raw URL string to check.
 * @returns Resolves to `true` when the URL cannot be parsed (treated as
 *   local to be safe) or when `isURLLocal` reports the host as local.
 */
export async function isURLStringLocal(url: string): Promise<boolean> {
  let urlObj: URL;
  try {
    urlObj = new URL(url);
  } catch {
    // assume local if URL parsing fails
    return true;
  }
  return isURLLocal(urlObj);
}
|
||
return isLocal; | ||
// TODO: move this to a generic url helper | ||
/**
 * Reports whether a string can be parsed by the `URL` constructor.
 *
 * @param url - Candidate URL string.
 * @returns `true` when `new URL(url)` succeeds, `false` when it throws.
 */
export function isURL(url: string) {
  let parsable = true;
  try {
    new URL(url);
  } catch (e) {
    parsable = false;
  }
  return parsable;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch"; | ||
import type { EmbeddingBackendModel } from "../../embeddingModels"; | ||
import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity"; | ||
import { MarkdownElementType, type MarkdownElement } from "../markdown/types"; | ||
import { stringifyMarkdownElement } from "../markdown/utils/stringify"; | ||
import { flattenTree } from "./tree"; | ||
|
||
const MIN_CHARS = 3000; | ||
const SOFT_MAX_CHARS = 8000; | ||
|
||
/**
 * Picks the markdown elements most relevant to `prompt` from the scraped
 * sources and returns each source together with its selected context text.
 *
 * Steps:
 * 1. Flatten every source's markdown tree and embed the stringified elements
 *    (each truncated to `embeddingModel.chunkCharLength` characters).
 * 2. Rank candidates by ascending `distance`, dropping header elements.
 * 3. Walk the ranking, skipping candidates flagged as too similar to an
 *    already selected one, and add each element plus its parent header.
 * 4. Stop once `SOFT_MAX_CHARS` is exceeded, or once `MIN_CHARS` is exceeded
 *    and the current candidate's distance is above 0.25.
 *
 * @param sources - Scraped sources whose `page.markdownTree` is searched.
 * @param prompt - Text the elements are compared against.
 * @param embeddingModel - Model passed through to `getSentenceSimilarity`.
 * @returns The sources that contributed at least one selected element, each
 *   extended with a `context` string (selected elements joined by newlines).
 */
export async function findContextSources(
  sources: WebSearchScrapedSource[],
  prompt: string,
  embeddingModel: EmbeddingBackendModel
) {
  // One flattened element list per source; index-aligned with `sources`
  const elemsBySource = sources.map((source) => flattenTree(source.page.markdownTree));
  const allElems = elemsBySource.flat();

  // Safety in case the stringified markdown elements are too long
  // but chunking should have happened earlier
  const sentences = allElems
    .map(stringifyMarkdownElement)
    .map((text) => text.slice(0, embeddingModel.chunkCharLength));

  const embeddings = await getSentenceSimilarity(embeddingModel, prompt, sentences);

  // Closest first; headers are never candidates themselves (they are only
  // pulled in as the parent of a selected element)
  const rankedCandidates = embeddings
    .sort((a, b) => a.distance - b.distance)
    .filter((candidate) => allElems[candidate.idx].type !== MarkdownElementType.Header);

  const pickedElems = new Set<MarkdownElement>();
  const pickedVectors: number[][] = [];
  let charCount = 0;

  for (const candidate of rankedCandidates) {
    const elem = allElems[candidate.idx];

    // Ignore elements that are too similar to already selected elements
    const isNearDuplicate = pickedVectors.some(
      (pickedVector) => innerProduct(pickedVector, candidate.embedding) < 0.01
    );
    if (isNearDuplicate) continue;

    // Select the element itself
    if (!pickedElems.has(elem)) {
      pickedElems.add(elem);
      pickedVectors.push(candidate.embedding);
      charCount += elem.content.length;
    }

    // Select the element's parent (header)
    const header = elem.parent;
    if (header && !pickedElems.has(header)) {
      pickedElems.add(header);
      charCount += header.content.length;
    }

    if (charCount > SOFT_MAX_CHARS) break;
    if (charCount > MIN_CHARS && candidate.distance > 0.25) break;
  }

  // Rebuild per-source context, keeping the original element order
  const contextSources: WebSearchUsedSource[] = [];
  for (const [idx, elems] of elemsBySource.entries()) {
    const context = elems
      .filter((elem) => pickedElems.has(elem))
      .map(stringifyMarkdownElement)
      .join("\n");
    if (context.length > 0) contextSources.push({ ...sources[idx], context });
  }

  return contextSources;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import type { MarkdownElement } from "../markdown/types"; | ||
|
||
/**
 * Flattens a markdown element tree into a list in pre-order: each element
 * comes first, followed by the flattened contents of its `children` (if any).
 *
 * @param elem - Root element of the (sub)tree.
 * @returns Every element reachable from `elem`, root included.
 */
export function flattenTree(elem: MarkdownElement): MarkdownElement[] {
  const flattened: MarkdownElement[] = [];

  const visit = (node: MarkdownElement): void => {
    flattened.push(node);
    if ("children" in node) {
      for (const child of node.children) visit(child);
    }
  };

  visit(elem);
  return flattened;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import { collapseString, sanitizeString } from "./utils/nlp"; | ||
import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify"; | ||
import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types"; | ||
import type { SerializedHTMLElement } from "../scrape/types"; | ||
|
||
/** State threaded through the recursive HTML → markdown conversion. */
type ConversionState = {
  /** Number of list elements enclosing the current node. */
  listDepth: number;
  /** Number of block quote elements enclosing the current node. */
  blockQuoteDepth: number;
  /** Element type assigned to text found at the current position. */
  defaultType:
    | MarkdownElementType.Paragraph
    | MarkdownElementType.BlockQuote
    | MarkdownElementType.UnorderedListItem
    | MarkdownElementType.OrderedListItem;
};
/**
 * Converts a serialized HTML node into markdown element(s).
 *
 * - Text nodes become a single element whose type comes from
 *   `prevState.defaultType`; whitespace-only text yields no elements.
 * - Header tags become a header element with an empty `children` list.
 * - Code block tags become a single element holding the unformatted text.
 * - Any other tag is flattened: its children are converted recursively with
 *   a state updated for enclosing lists / block quotes.
 *
 * @param parent - Header recorded as the `parent` of every created element.
 * @param elem - Serialized HTML element, or a raw text node.
 * @param prevState - Conversion state inherited from the enclosing element.
 * @returns One element, or a (possibly empty) list of elements.
 */
export function htmlElementToMarkdownElements(
  parent: HeaderElement,
  elem: SerializedHTMLElement | string,
  prevState: ConversionState = {
    defaultType: MarkdownElementType.Paragraph,
    listDepth: 0,
    blockQuoteDepth: 0,
  }
): MarkdownElement | MarkdownElement[] {
  // Text node: emit an element shaped by the inherited state
  if (typeof elem === "string") {
    if (elem.trim().length === 0) return [];

    const textType = prevState.defaultType;
    switch (textType) {
      case MarkdownElementType.UnorderedListItem:
      case MarkdownElementType.OrderedListItem:
        return { parent, type: textType, content: elem, depth: prevState.listDepth };
      case MarkdownElementType.BlockQuote:
        return { parent, type: textType, content: elem, depth: prevState.blockQuoteDepth };
      default:
        return { parent, type: textType, content: elem };
    }
  }

  // Unknown tags are treated like paragraphs
  const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;

  // Headers: the level is read from the second character of the tag name
  if (type === MarkdownElementType.Header) {
    return {
      parent,
      type,
      level: Number(elem.tagName[1]),
      content: collapseString(stringifyHTMLElements(elem.content)),
      children: [],
    };
  }

  // Code blocks keep their text unformatted
  if (type === MarkdownElementType.CodeBlock) {
    return {
      parent,
      type,
      content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
    };
  }

  // Derive the state handed down to this element's children
  const childState: ConversionState = { ...prevState };
  switch (type) {
    case MarkdownElementType.UnorderedList:
      childState.listDepth += 1;
      childState.defaultType = MarkdownElementType.UnorderedListItem;
      break;
    case MarkdownElementType.OrderedList:
      childState.listDepth += 1;
      childState.defaultType = MarkdownElementType.OrderedListItem;
      break;
    case MarkdownElementType.BlockQuote:
      childState.defaultType = MarkdownElementType.BlockQuote;
      childState.blockQuoteDepth += 1;
      break;
  }

  // Typical case, we want to flatten the DOM and only create elements when we see text
  return elem.content.flatMap((child) => htmlElementToMarkdownElements(parent, child, childState));
}
|
||
/**
 * Collapses runs of adjacent paragraph elements into a single paragraph.
 *
 * Only paragraphs are merged; every other element type is passed through
 * unchanged and also breaks a run. Merging appends the later paragraph's
 * `content` onto the first paragraph of the run in place, so that input
 * element is mutated and reused in the result.
 *
 * The result is accumulated with `push` rather than re-spreading the
 * accumulator for every element, which keeps the pass linear in the number
 * of elements.
 *
 * @param elements - Elements in document order.
 * @returns A new array with adjacent paragraphs merged.
 */
export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
  const merged: MarkdownElement[] = [];
  for (const elem of elements) {
    const last = merged[merged.length - 1];
    if (last && last.type === MarkdownElementType.Paragraph && last.type === elem.type) {
      last.content += elem.content;
      continue;
    }
    merged.push(elem);
  }
  return merged;
}
Oops, something went wrong.