From c06ecb0f48542dde1b80daf57fab4515226cf3dc Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 20 Jan 2025 13:39:21 +0000 Subject: [PATCH 1/8] refactor: move execution before citation renderers --- packages/myst-cli/src/process/mdast.ts | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index 750331710..c98408185 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -208,6 +208,20 @@ export async function transformMdast( // This needs to come after basic transformations since meta tags are added there propagateBlockDataToCode(session, vfile, mdast); + if (execute) { + const cachePath = path.join(session.buildPath(), 'execute'); + await kernelExecutionTransform(mdast, vfile, { + basePath: session.sourcePath(), + cache: new LocalDiskCache(cachePath), + sessionFactory: () => session.jupyterSessionManager(), + frontmatter: frontmatter, + ignoreCache: false, + errorIsFatal: false, + log: session.log, + }); + } + transformRenderInlineExpressions(mdast, vfile); + // Initialize citation renderers for this (non-bib) file cache.$citationRenderers[file] = await transformLinkedDOIs( session, @@ -225,20 +239,6 @@ export async function transformMdast( } // Combine file-specific citation renderers with project renderers from bib files const fileCitationRenderer = combineCitationRenderers(cache, ...rendererFiles); - - if (execute) { - const cachePath = path.join(session.buildPath(), 'execute'); - await kernelExecutionTransform(mdast, vfile, { - basePath: session.sourcePath(), - cache: new LocalDiskCache<(IExpressionResult | IOutput[])[]>(cachePath), - sessionFactory: () => session.jupyterSessionManager(), - frontmatter: frontmatter, - ignoreCache: false, - errorIsFatal: false, - log: session.log, - }); - } - transformRenderInlineExpressions(mdast, vfile); await transformOutputsToCache(session, mdast, kind, { minifyMaxCharacters }); transformFilterOutputStreams(mdast, vfile, frontmatter.settings); transformCitations(session, file, mdast, fileCitationRenderer, references); From 5f7f98e4b130f08e961ea347a9da6639575cca29 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Mon, 20 Jan 2025 13:40:57 +0000 Subject: [PATCH 2/8] refactor: run kernel execution before document plugins --- packages/myst-cli/src/process/mdast.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index c98408185..b6dbc19cf 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -198,11 +198,6 @@ export async function transformMdast( .use(abbreviationPlugin, { abbreviations: frontmatter.abbreviations }) .use(indexIdentifierPlugin) .use(enumerateTargetsPlugin, { state }); // This should be after math/container transforms - // Load custom transform plugins - session.plugins?.transforms.forEach((t) => { - if (t.stage !== 'document') return; - pipe.use(t.plugin, undefined, pluginUtils); - }); await pipe.run(mdast, vfile); // This needs to come after basic transformations since meta tags are added there @@ -222,6 +217,15 @@ export async function transformMdast( } transformRenderInlineExpressions(mdast, vfile); + // Run document plugins + const pluginPipe = unified(); + // Load custom transform plugins + session.plugins?.transforms.forEach((t) => { + if (t.stage !== 'document') return; + pluginPipe.use(t.plugin, undefined, pluginUtils); + }); + await pluginPipe.run(mdast, vfile); + // Initialize citation renderers for this (non-bib) file cache.$citationRenderers[file] = await transformLinkedDOIs( session, From ecbb5010ab4463335979371541257b684d926f05 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 21 Jan 2025 16:03:00 +0000 Subject: [PATCH 3/8] refactor: separate reduceOutputs from lifting --- packages/myst-cli/src/process/mdast.ts | 11 +- packages/myst-cli/src/transforms/outputs.ts | 300 +++++++++++--------- 2 files changed, 170 insertions(+), 141 deletions(-) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index b6dbc19cf..b53a9edd3 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -1,6 +1,6 @@ import path from 'node:path'; import { tic } from 'myst-cli-utils'; -import type { GenericParent, IExpressionResult, PluginUtils, References } from 'myst-common'; +import type { GenericParent, PluginUtils, References } from 'myst-common'; import { fileError, fileWarn, RuleId, slugToUrl } from 'myst-common'; import type { PageFrontmatter } from 'myst-frontmatter'; import { SourceFileKind, VERSION } from 'myst-spec-ext'; @@ -58,7 +58,8 @@ import { StaticFileTransformer, propagateBlockDataToCode, transformBanner, - reduceOutputs, + transformReduceOutputs, + transformLiftOutputs, transformPlaceholderImages, transformDeleteBase64UrlSource, transformWebp, @@ -385,10 +386,12 @@ export async function finalizeMdast( const vfile = new VFile(); // Collect errors on this file vfile.path = file; if (simplifyFigures) { - // Transform output nodes to images / text - reduceOutputs(session, mdast, file, imageWriteFolder, { + // Render out IOutput objects + transformReduceOutputs(session, mdast, file, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder, }); + // Lift rendered IOutput children + transformLiftOutputs(mdast); } transformOutputsToFile(session, mdast, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder, diff --git a/packages/myst-cli/src/transforms/outputs.ts b/packages/myst-cli/src/transforms/outputs.ts index 92990523d..7fa25fcc9 100644 --- a/packages/myst-cli/src/transforms/outputs.ts +++ b/packages/myst-cli/src/transforms/outputs.ts @@ -1,7 +1,7 @@ import fs from 'node:fs'; import { dirname, join, relative } from 'node:path'; import { computeHash } from 'myst-cli-utils'; -import type { Image, SourceFileKind, Output } from 'myst-spec-ext'; +import type { Image, SourceFileKind } from 'myst-spec-ext'; import { liftChildren, fileError, RuleId, fileWarn } from 'myst-common'; import type { GenericNode, GenericParent } from 'myst-common'; import type { ProjectSettings } from 'myst-frontmatter'; @@ -25,6 +25,53 @@ function getWriteDestination(hash: string, contentType: string, writeFolder: str return join(writeFolder, getFilename(hash, contentType)); } +const MARKDOWN_MIME_TYPE = 'text/markdown'; +const SUPPORTED_MARKDOWN_VARIANTS = ['Original', 'GFM', 'CommonMark', 'myst']; +const MIME_OUTPUT_TYPES = ['display_data', 'update_display_data', 'execute_result']; + +/** + * Extract the `variant` parameter from a Markdown MIME type + * + * @param mimeType MIME type of the form `text/markdown;FOO=BAR` + */ +function extractVariantParameter(mimeType: string): string | undefined { + const [variant] = Array.from(mimeType.matchAll(/;([^;]+)=([^;]+)/g)) + .filter(([name]) => name === 'variant') + .map((pair) => pair[1]); + return variant; +} + +export async function transformMarkdownOutputs( + mdast: GenericParent, + opts: { + parser: (content: string) => GenericParent; + }, +) { + const outputs = selectAll('output', mdast) as GenericNode[]; + outputs.forEach((output) => { + const rawOutput = output.data; + if (!MIME_OUTPUT_TYPES.includes(rawOutput?.output_type)) { + return; + } + // Find the most MyST-like Markdown (if any) + const [preferredEntry] = Object.entries(rawOutput.data as Record) + // Find only markdown outputs + .filter(([mimeType]) => mimeType.startsWith(MARKDOWN_MIME_TYPE)) + // Determine the Markdown variant + .map(([mimeType, data]) => [extractVariantParameter(mimeType), data]) + // Filter out non-MyST or plan Markdown + .filter(([variant]) => SUPPORTED_MARKDOWN_VARIANTS.includes(variant)) + .sort((left) => (left[0] === undefined ? +1 : -1)); + + // Process Markdown + if (preferredEntry !== undefined) { + const data = preferredEntry[1]; + const outputMdast = opts.parser(data as string); + output.children = outputMdast.children; + } + }); +} + /** * Traverse all output nodes, minify their content, and cache on the session */ @@ -34,28 +81,18 @@ export async function transformOutputsToCache( kind: SourceFileKind, opts?: { minifyMaxCharacters?: number }, ) { - const outputsNodes = selectAll('outputs', mdast) as GenericNode[]; + const outputs = selectAll('output', mdast) as GenericNode[]; + if (!outputs.length) return; const cache = castSession(session); await Promise.all( - outputsNodes - // Ignore outputs that are hidden - .filter((outputs) => outputs.visibility !== 'remove') - // Pull out children - .map((outputs) => outputs.children as Output[]) - .flat() - // Filter outputs with no data - // TODO: can this ever occur? - .filter((output) => (output as any).jupyter_data !== undefined) - // Minify output data + outputs + .filter((output) => output.visibility !== 'remove') .map(async (output) => { - [(output as any).jupyter_data] = await minifyCellOutput( - [(output as any).jupyter_data] as IOutput[], - cache.$outputs, - { - computeHash, - maxCharacters: opts?.minifyMaxCharacters, - }, - ); + const [tmpData] = await minifyCellOutput([output.data] as IOutput[], cache.$outputs, { + computeHash, + maxCharacters: opts?.minifyMaxCharacters, + }); + output.data = tmpData; }), ); } @@ -86,10 +123,13 @@ export function transformFilterOutputStreams( const blockRemoveStderr = tags.includes('remove-stderr'); const blockRemoveStdout = tags.includes('remove-stdout'); const outputs = selectAll('output', block) as GenericNode[]; - outputs - .filter((output) => { - const data = output.jupyter_data; - + // There should be only one output in the block + outputs.forEach((output) => { + if (!output.data) { + return; + } + console.log(JSON.stringify(output.data, null, 2)); + output.data = [output.data].filter((data: IStream | MinifiedMimeOutput) => { if ( (stderr !== 'show' || blockRemoveStderr) && data.output_type === 'stream' && @@ -108,7 +148,7 @@ export function transformFilterOutputStreams( }, ); } - return doRemove; + return !doRemove; } if ( (stdout !== 'show' || blockRemoveStdout) && @@ -128,7 +168,7 @@ export function transformFilterOutputStreams( }, ); } - return doRemove; + return !doRemove; } if ( mpl !== 'show' && @@ -153,15 +193,12 @@ export function transformFilterOutputStreams( }, ); } - return doRemove; + return !doRemove; } - return false; - }) - .forEach((output) => { - output.type = '__delete__'; - }); + return true; + })[0]; + }); }); - remove(mdast, { cascade: false }, '__delete__'); } function writeCachedOutputToFile( @@ -207,19 +244,17 @@ export function transformOutputsToFile( const outputs = selectAll('output', mdast) as GenericNode[]; const cache = castSession(session); - outputs - .filter((output) => !!output.jupyter_data) - .forEach((node) => { - // TODO: output-refactoring -- drop to single output in future - walkOutputs([node.jupyter_data], (obj) => { - const { hash } = obj; - if (!hash || !cache.$outputs[hash]) return undefined; - obj.path = writeCachedOutputToFile(session, hash, cache.$outputs[hash], writeFolder, { - ...opts, - node, - }); + outputs.forEach((node) => { + const rawOutputs = node.data === undefined ? [] : [node.data]; + walkOutputs(rawOutputs, (obj) => { + const { hash } = obj; + if (!hash || !cache.$outputs[hash]) return undefined; + obj.path = writeCachedOutputToFile(session, hash, cache.$outputs[hash], writeFolder, { + ...opts, + node, }); }); + }); } /** @@ -237,114 +272,105 @@ function isPreferredOutputType(newType: string, existingType: string) { if (newType === 'text/html') return true; return false; } - +/** + * Lift the children of outputs nodes into their parents + */ +export function transformLiftOutputs(mdast: GenericParent) { + const outputsNodes = selectAll('outputs', mdast) as GenericNode[]; + outputsNodes.forEach((node) => { + // Lift all output nodes + const outputNodes = selectAll('output', node) as GenericNode[]; + outputNodes.forEach((outputNode) => { + outputNode.type = '__lift__'; + }); + // Lift the outputs node + node.type = '__lift__'; + }); + liftChildren(mdast, '__lift__'); +} /** * Convert output nodes with minified content to image or code * * This writes outputs of type image to file, modifies outputs of type * text to a code node, and removes other output types. */ -export function reduceOutputs( +export function transformReduceOutputs( session: ISession, mdast: GenericParent, file: string, writeFolder: string, opts?: { altOutputFolder?: string; vfile?: VFile }, ) { - const outputsNodes = selectAll('outputs', mdast) as GenericNode[]; + const outputs = selectAll('output', mdast) as GenericNode[]; const cache = castSession(session); - outputsNodes.forEach((outputsNode) => { - const outputs = outputsNode.children as GenericNode[]; - - outputs.forEach((outputNode) => { - if (outputNode.visibility === 'remove' || outputNode.visibility === 'hide') { - // Hidden nodes should not show up in simplified outputs for static export - outputNode.type = '__delete__'; - return; - } - if (!outputNode.jupyter_data && !outputNode.children?.length) { - outputNode.type = '__delete__'; - return; - } - // Lift the `output` node into `Outputs` - outputNode.type = '__lift__'; - - // If the output already has children, we don't need to do anything - if (outputNode.children?.length) { - return; - } - - // Find a preferred IOutput type to render into the AST - const selectedOutputs: { content_type: string; hash: string }[] = []; - if (outputNode.jupyter_data) { - const output = outputNode.jupyter_data; - - let selectedOutput: { content_type: string; hash: string } | undefined; - walkOutputs([output], (obj: any) => { - const { output_type, content_type, hash } = obj; - if (!hash) return undefined; - if (!selectedOutput || isPreferredOutputType(content_type, selectedOutput.content_type)) { - if (['error', 'stream'].includes(output_type)) { - selectedOutput = { content_type: 'text/plain', hash }; - } else if (typeof content_type === 'string') { - if ( - content_type.startsWith('image/') || - content_type === 'text/plain' || - content_type === 'text/html' - ) { - selectedOutput = { content_type, hash }; - } + outputs.forEach((node) => { + if (!node.data) { + return; + } + const selectedOutputs: { content_type: string; hash: string }[] = []; + const rawOutputs = node.data === undefined ? [] : [node.data]; + rawOutputs.forEach((output: MinifiedOutput) => { + let selectedOutput: { content_type: string; hash: string } | undefined; + walkOutputs([output], (obj: any) => { + const { output_type, content_type, hash } = obj; + if (!hash) return undefined; + if (!selectedOutput || isPreferredOutputType(content_type, selectedOutput.content_type)) { + if (['error', 'stream'].includes(output_type)) { + selectedOutput = { content_type: 'text/plain', hash }; + } else if (typeof content_type === 'string') { + if ( + content_type.startsWith('image/') || + content_type === 'text/plain' || + content_type === 'text/html' + ) { + selectedOutput = { content_type, hash }; } } - }); - if (selectedOutput) selectedOutputs.push(selectedOutput); - } - const children: (Image | GenericNode)[] = selectedOutputs - .map((output): Image | GenericNode | GenericNode[] | undefined => { - const { content_type, hash } = output ?? {}; - if (!hash || !cache.$outputs[hash]) return undefined; - if (content_type === 'text/html') { - const htmlTree = { - type: 'root', - children: [ - { - type: 'html', - value: cache.$outputs[hash][0], - }, - ], - }; - htmlTransform(htmlTree); - return htmlTree.children; - } else if (content_type.startsWith('image/')) { - const path = writeCachedOutputToFile(session, hash, cache.$outputs[hash], writeFolder, { - ...opts, - node: outputNode, - }); - if (!path) return undefined; - const relativePath = relative(dirname(file), path); - return { - type: 'image', - data: { type: 'output' }, - url: relativePath, - urlSource: relativePath, - }; - } else if (content_type === 'text/plain' && cache.$outputs[hash]) { - const [content] = cache.$outputs[hash]; - return { - type: 'code', - data: { type: 'output' }, - value: stripAnsi(content), - }; - } - return undefined; - }) - .flat() - .filter((output): output is Image | GenericNode => !!output); - outputNode.children = children; + } + }); + if (selectedOutput) selectedOutputs.push(selectedOutput); }); - // Lift the `outputs` node - outputsNode.type = '__lift__'; + const children: (Image | GenericNode)[] = selectedOutputs + .map((output): Image | GenericNode | GenericNode[] | undefined => { + const { content_type, hash } = output ?? {}; + if (!hash || !cache.$outputs[hash]) return undefined; + if (content_type === 'text/html') { + const htmlTree = { + type: 'root', + children: [ + { + type: 'html', + value: cache.$outputs[hash][0], + }, + ], + }; + htmlTransform(htmlTree); + return htmlTree.children; + } else if (content_type.startsWith('image/')) { + const path = writeCachedOutputToFile(session, hash, cache.$outputs[hash], writeFolder, { + ...opts, + node, + }); + if (!path) return undefined; + const relativePath = relative(dirname(file), path); + return { + type: 'image', + data: { type: 'output' }, + url: relativePath, + urlSource: relativePath, + }; + } else if (content_type === 'text/plain' && cache.$outputs[hash]) { + const [content] = cache.$outputs[hash]; + return { + type: 'code', + data: { type: 'output' }, + value: stripAnsi(content), + }; + } + return undefined; + }) + .flat() + .filter((output): output is Image | GenericNode => !!output); + node.children = children; }); - remove(mdast, '__delete__'); - liftChildren(mdast, '__lift__'); } From eb876da0d2005dc5f8bf6872e25bd03923c498c4 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Tue, 21 Jan 2025 17:00:36 +0000 Subject: [PATCH 4/8] refactor: prepare for MD parsing --- packages/myst-cli/src/process/mdast.ts | 7 +- packages/myst-cli/src/transforms/outputs.ts | 80 +++++++++++---------- 2 files changed, 43 insertions(+), 44 deletions(-) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index b53a9edd3..d71ebd827 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -1,6 +1,6 @@ import path from 'node:path'; import { tic } from 'myst-cli-utils'; -import type { GenericParent, PluginUtils, References } from 'myst-common'; +import type { GenericParent, IExpressionResult, PluginUtils, References } from 'myst-common'; import { fileError, fileWarn, RuleId, slugToUrl } from 'myst-common'; import type { PageFrontmatter } from 'myst-frontmatter'; import { SourceFileKind, VERSION } from 'myst-spec-ext'; @@ -59,7 +59,6 @@ import { propagateBlockDataToCode, transformBanner, transformReduceOutputs, - transformLiftOutputs, transformPlaceholderImages, transformDeleteBase64UrlSource, transformWebp, @@ -386,12 +385,10 @@ export async function finalizeMdast( const vfile = new VFile(); // Collect errors on this file vfile.path = file; if (simplifyFigures) { - // Render out IOutput objects + // Transform output nodes to images / text transformReduceOutputs(session, mdast, file, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder, }); - // Lift rendered IOutput children - transformLiftOutputs(mdast); } transformOutputsToFile(session, mdast, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder, diff --git a/packages/myst-cli/src/transforms/outputs.ts b/packages/myst-cli/src/transforms/outputs.ts index 7fa25fcc9..8269cbde2 100644 --- a/packages/myst-cli/src/transforms/outputs.ts +++ b/packages/myst-cli/src/transforms/outputs.ts @@ -41,35 +41,21 @@ function extractVariantParameter(mimeType: string): string | undefined { return variant; } -export async function transformMarkdownOutputs( - mdast: GenericParent, - opts: { - parser: (content: string) => GenericParent; - }, -) { - const outputs = selectAll('output', mdast) as GenericNode[]; - outputs.forEach((output) => { - const rawOutput = output.data; - if (!MIME_OUTPUT_TYPES.includes(rawOutput?.output_type)) { - return; - } - // Find the most MyST-like Markdown (if any) - const [preferredEntry] = Object.entries(rawOutput.data as Record) - // Find only markdown outputs - .filter(([mimeType]) => mimeType.startsWith(MARKDOWN_MIME_TYPE)) - // Determine the Markdown variant - .map(([mimeType, data]) => [extractVariantParameter(mimeType), data]) - // Filter out non-MyST or plan Markdown - .filter(([variant]) => SUPPORTED_MARKDOWN_VARIANTS.includes(variant)) - .sort((left) => (left[0] === undefined ? +1 : -1)); +function preferredMarkdownMIMEType( + mimeType: string, +): { variant?: string; mimeType: string } | undefined { + if (!mimeType.startsWith(MARKDOWN_MIME_TYPE)) { + return; + } + const variant = extractVariantParameter(mimeType); + if (!variant) { + return { mimeType }; + } + if (SUPPORTED_MARKDOWN_VARIANTS.includes(variant)) { + return { mimeType, variant }; + } - // Process Markdown - if (preferredEntry !== undefined) { - const data = preferredEntry[1]; - const outputMdast = opts.parser(data as string); - output.children = outputMdast.children; - } - }); + return; } /** @@ -266,11 +252,15 @@ export function transformOutputsToFile( * If the new and existing types are the same, always just keep existing. */ function isPreferredOutputType(newType: string, existingType: string) { - if (existingType.startsWith('image/')) return false; - if (newType.startsWith('image')) return true; - if (existingType === 'text/html') return false; - if (newType === 'text/html') return true; - return false; + const newIndex = PREFERRED_MIME_TYPES.findIndex((fn) => fn(newType)); + const existingIndex = PREFERRED_MIME_TYPES.findIndex((fn) => fn(existingType)); + if (newIndex === -1) { + return false; + } else if (existingIndex === -1) { + return true; + } else { + return newIndex < existingIndex; + } } /** * Lift the children of outputs nodes into their parents @@ -288,6 +278,18 @@ export function transformLiftOutputs(mdast: GenericParent) { }); liftChildren(mdast, '__lift__'); } + +type TestFunction = (mime: string) => boolean; + +export const PREFERRED_MIME_TYPES: TestFunction[] = [ + // Markdown (any variant) + (mime) => !!preferredMarkdownMIMEType(mime), + // Image (any type) + (mime) => mime.startsWith('image'), + (mime) => mime === 'text/html', + (mime) => mime === 'text/plain', +]; + /** * Convert output nodes with minified content to image or code * @@ -315,14 +317,14 @@ export function transformReduceOutputs( const { output_type, content_type, hash } = obj; if (!hash) return undefined; if (!selectedOutput || isPreferredOutputType(content_type, selectedOutput.content_type)) { + // Check output type if (['error', 'stream'].includes(output_type)) { selectedOutput = { content_type: 'text/plain', hash }; - } else if (typeof content_type === 'string') { - if ( - content_type.startsWith('image/') || - content_type === 'text/plain' || - content_type === 'text/html' - ) { + } + // Check content type + else if (typeof content_type === 'string') { + const preferredMime = PREFERRED_MIME_TYPES.some((matcher) => !!matcher(content_type)); + if (preferredMime) { selectedOutput = { content_type, hash }; } } From db8033822e0c5251cd433d8399cd4cd499984c67 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 22 Jan 2025 13:29:43 +0000 Subject: [PATCH 5/8] refactor: more readability --- packages/myst-execute/src/cache.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/myst-execute/src/cache.ts b/packages/myst-execute/src/cache.ts index af6f25f91..8cb576e8f 100644 --- a/packages/myst-execute/src/cache.ts +++ b/packages/myst-execute/src/cache.ts @@ -36,11 +36,13 @@ export class LocalDiskCache implements ICache { if (!existsSync(keyPath)) { return undefined; } - return JSON.parse(readFileSync(keyPath, { encoding: 'utf8' })); + const data = readFileSync(keyPath, { encoding: 'utf8' }); + return JSON.parse(data); } set(key: string, item: T) { + const data = JSON.stringify(item); const keyPath = this._makeKeyPath(key); - return writeFileSync(keyPath, JSON.stringify(item), { encoding: 'utf8' }); + return writeFileSync(keyPath, data, { encoding: 'utf8' }); } } From c7ce681d78d7da90b2a2e99c81c3f18e89ff5c00 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 22 Jan 2025 13:39:18 +0000 Subject: [PATCH 6/8] wip: use nbtx types for outputs Minify outputs early --- package-lock.json | 4 +- packages/myst-cli/src/process/file.ts | 3 +- packages/myst-cli/src/process/mdast.ts | 3 +- packages/myst-cli/src/process/notebook.ts | 32 +++++++---- packages/myst-cli/src/transforms/outputs.ts | 6 +-- packages/myst-execute/package.json | 3 +- packages/myst-execute/src/execute.ts | 59 +++++++++++++++------ packages/myst-spec-ext/package.json | 3 +- packages/myst-spec-ext/src/types.ts | 3 +- 9 files changed, 78 insertions(+), 38 deletions(-) diff --git a/package-lock.json b/package-lock.json index 736e8d550..518b53bfd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15751,6 +15751,7 @@ "chalk": "^5.2.0", "myst-cli-utils": "^2.0.11", "myst-common": "^1.7.6", + "nbtx": "^0.2.3", "node-fetch": "^3.3.0", "unist-util-select": "^4.0.3", "vfile": "^5.3.7", @@ -15965,7 +15966,8 @@ "version": "1.7.6", "license": "MIT", "dependencies": { - "myst-spec": "^0.0.5" + "myst-spec": "^0.0.5", + "nbtx": "^0.2.3" } }, "packages/myst-templates": { diff --git a/packages/myst-cli/src/process/file.ts b/packages/myst-cli/src/process/file.ts index d2e93f730..d78db3263 100644 --- a/packages/myst-cli/src/process/file.ts +++ b/packages/myst-cli/src/process/file.ts @@ -36,6 +36,7 @@ type LoadFileOptions = { preFrontmatter?: Record; keepTitleNode?: boolean; kind?: SourceFileKind; + minifyMaxCharacters?: number; }; export type LoadFileResult = { @@ -85,7 +86,7 @@ export async function loadNotebookFile( mdast, frontmatter: nbFrontmatter, widgets, - } = await processNotebookFull(session, file, content); + } = await processNotebookFull(session, file, content, opts?.minifyMaxCharacters); const { frontmatter: cellFrontmatter, identifiers } = getPageFrontmatter( session, mdast, diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index d71ebd827..763e4eb50 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -76,7 +76,6 @@ import { combineCitationRenderers } from './citations.js'; import { bibFilesInDir, selectFile } from './file.js'; import { parseMyst } from './myst.js'; import { kernelExecutionTransform, LocalDiskCache } from 'myst-execute'; -import type { IOutput } from '@jupyterlab/nbformat'; import { rawDirectiveTransform } from '../transforms/raw.js'; const LINKS_SELECTOR = 'link,card,linkBlock'; @@ -213,6 +212,8 @@ export async function transformMdast( ignoreCache: false, errorIsFatal: false, log: session.log, + outputCache: cache.$outputs, + minifyMaxCharacters, }); } transformRenderInlineExpressions(mdast, vfile); diff --git a/packages/myst-cli/src/process/notebook.ts b/packages/myst-cli/src/process/notebook.ts index 59257a5dc..8fd040451 100644 --- a/packages/myst-cli/src/process/notebook.ts +++ b/packages/myst-cli/src/process/notebook.ts @@ -7,14 +7,15 @@ import type { ICell, IMimeBundle, INotebookContent, - INotebookMetadata, IOutput, MultilineString, } from '@jupyterlab/nbformat'; -import { CELL_TYPES, ensureString } from 'nbtx'; +import { CELL_TYPES, ensureString, minifyCellOutput } from 'nbtx'; import { VFile } from 'vfile'; import { logMessagesFromVFile } from '../utils/logging.js'; import type { ISession } from '../session/types.js'; +import { castSession } from '../session/cache.js'; +import { computeHash } from 'myst-cli-utils'; import { BASE64_HEADER_SPLIT } from '../transforms/images.js'; import { parseMyst } from './myst.js'; import type { Code, InlineExpression } from 'myst-spec-ext'; @@ -88,8 +89,10 @@ export async function processNotebookFull( session: ISession, file: string, content: string, + minifyMaxCharacters?: number, ): Promise<{ mdast: GenericParent; frontmatter: PageFrontmatter; widgets: Record }> { const { log } = session; + const cache = castSession(session); const { metadata, cells } = JSON.parse(content) as INotebookContent; // notebook will be empty, use generateNotebookChildren, generateNotebookOrder here if we want to populate those @@ -165,15 +168,22 @@ export async function processNotebookFull( value: ensureString(cell.source), }; - const outputsChildren = (cell.outputs as IOutput[]).map((output) => { - // Embed outputs in an output block - const result = { - type: 'output', - jupyter_data: output, - children: [], - }; - return result; - }); + const outputsChildren = await Promise.all( + (cell.outputs as IOutput[]).map(async (output) => { + // Minify the output + const [minifiedOutput] = await minifyCellOutput([output] as IOutput[], cache.$outputs, { + computeHash, + maxCharacters: minifyMaxCharacters, + }); + // Embed outputs in an output block + const result = { + type: 'output', + jupyter_data: minifiedOutput, + children: [], + }; + return result; + }), + ); const outputs = { type: 'outputs', id: nanoid(), diff --git a/packages/myst-cli/src/transforms/outputs.ts b/packages/myst-cli/src/transforms/outputs.ts index 8269cbde2..f9f3033f2 100644 --- a/packages/myst-cli/src/transforms/outputs.ts +++ b/packages/myst-cli/src/transforms/outputs.ts @@ -7,7 +7,6 @@ import type { GenericNode, GenericParent } from 'myst-common'; import type { ProjectSettings } from 'myst-frontmatter'; import { htmlTransform } from 'myst-transforms'; import stripAnsi from 'strip-ansi'; -import { remove } from 'unist-util-remove'; import { selectAll } from 'unist-util-select'; import type { VFile } from 'vfile'; import type { IOutput, IStream } from '@jupyterlab/nbformat'; @@ -27,7 +26,6 @@ function getWriteDestination(hash: string, contentType: string, writeFolder: str const MARKDOWN_MIME_TYPE = 'text/markdown'; const SUPPORTED_MARKDOWN_VARIANTS = ['Original', 'GFM', 'CommonMark', 'myst']; -const MIME_OUTPUT_TYPES = ['display_data', 'update_display_data', 'execute_result']; /** * Extract the `variant` parameter from a Markdown MIME type @@ -279,9 +277,7 @@ export function transformLiftOutputs(mdast: GenericParent) { liftChildren(mdast, '__lift__'); } -type TestFunction = (mime: string) => boolean; - -export const PREFERRED_MIME_TYPES: TestFunction[] = [ +export const PREFERRED_MIME_TYPES: Array<(mime: string) => boolean> = [ // Markdown (any variant) (mime) => !!preferredMarkdownMIMEType(mime), // Image (any type) diff --git a/packages/myst-execute/package.json b/packages/myst-execute/package.json index af5718e7b..fc4fe4dda 100644 --- a/packages/myst-execute/package.json +++ b/packages/myst-execute/package.json @@ -39,7 +39,8 @@ "node-fetch": "^3.3.0", "unist-util-select": "^4.0.3", "vfile": "^5.3.7", - "which": "^4.0.0" + "which": "^4.0.0", + "nbtx": "^0.2.3" }, "devDependencies": { "@jupyterlab/nbformat": "^3.5.2", diff --git a/packages/myst-execute/src/execute.ts b/packages/myst-execute/src/execute.ts index ab2d5bc1a..e9dfe4a6e 100644 --- a/packages/myst-execute/src/execute.ts +++ b/packages/myst-execute/src/execute.ts @@ -1,4 +1,5 @@ import { select, selectAll } from 'unist-util-select'; +import { computeHash } from 'myst-cli-utils'; import type { Logger } from 'myst-cli-utils'; import type { PageFrontmatter, KernelSpec } from 'myst-frontmatter'; import type { Kernel, KernelMessage, Session, SessionManager } from '@jupyterlab/services'; @@ -12,6 +13,8 @@ import assert from 'node:assert'; import { createHash } from 'node:crypto'; import type { Plugin } from 'unified'; import type { ICache } from './cache.js'; +import { minifyCellOutput } from 'nbtx'; +import type { MinifiedContentCache } from 'nbtx'; /** * Interpret an IOPub message as an IOutput object @@ -206,7 +209,7 @@ function isInlineExpression(node: GenericNode): node is InlineExpression { async function computeExecutableNodes( kernel: Kernel.IKernelConnection, nodes: (CodeBlock | InlineExpression)[], - opts: { vfile: VFile }, + vfile: VFile, ): Promise<{ results: (IOutput[] | IExpressionResult)[]; errorOccurred: boolean; @@ -215,9 +218,11 @@ async function computeExecutableNodes( const results: (IOutput[] | IExpressionResult)[] = []; for (const matchedNode of nodes) { + // An executable cell if (isCellBlock(matchedNode)) { // Pull out code to execute const code = select('code', matchedNode) as Code; + // Invoke kernel const { status, outputs } = await executeCode(kernel, code.value); // Cache result results.push(outputs); @@ -225,12 +230,14 @@ async function computeExecutableNodes( // Check for errors const allowErrors = codeBlockRaisesException(matchedNode); if (status === 'error' && !allowErrors) { + // Join all error tracebacks into a single newline-delimited string const errorMessage = outputs .map((item) => item.traceback) + .filter((item) => !!item) .flat() .join('\n'); fileError( - opts.vfile, + vfile, `An exception occurred during code execution, halting further execution:\n\n${errorMessage}`, { node: matchedNode, @@ -248,7 +255,7 @@ async function computeExecutableNodes( if (status === 'error') { const errorMessage = (result as IExpressionError).traceback.join('\n'); fileError( - opts.vfile, + vfile, `An exception occurred during expression evaluation, halting further execution:\n\n${errorMessage}`, { node: matchedNode }, ); @@ -273,9 +280,11 @@ async function computeExecutableNodes( * @param nodes executable MDAST nodes * @param computedResult computed results for each node */ -function applyComputedOutputsToNodes( +async function applyComputedOutputsToNodes( nodes: (CodeBlock | InlineExpression)[], computedResult: (IOutput[] | IExpressionResult)[], + outputCache: MinifiedContentCache, + minifyMaxCharacters?: number, ) { for (const matchedNode of nodes) { // Pull out the result for this node @@ -286,9 +295,17 @@ function applyComputedOutputsToNodes( // Pull out outputs to set data const outputs = select('outputs', matchedNode) as Outputs; // Ensure that whether this fails or succeeds, we write to `children` (e.g. due to a kernel error) - outputs.children = rawOutputData.map((data) => { - return { type: 'output', children: [], jupyter_data: data as any }; - }); + outputs.children = await Promise.all( + rawOutputData.map(async (data) => { + // Minify the output + const [minifiedOutput] = await minifyCellOutput([data], outputCache, { + computeHash, + maxCharacters: minifyMaxCharacters, + }); + const output: Output = { type: 'output', children: [], jupyter_data: minifiedOutput }; + return output; + }), + ); } else if (isInlineExpression(matchedNode)) { const rawOutputData = thisResult as Record | undefined; // Set data of expression to the result, or empty if we don't have one @@ -303,11 +320,14 @@ function applyComputedOutputsToNodes( export type Options = { basePath: string; cache: ICache<(IExpressionResult | IOutput[])[]>; + outputCache: MinifiedContentCache; sessionFactory: () => Promise; frontmatter: PageFrontmatter; + // Ignore output cache and forcibly re-execute ignoreCache?: boolean; errorIsFatal?: boolean; log?: Logger; + minifyMaxCharacters?: number; }; /** @@ -318,6 +338,8 @@ export type Options = { * @param opts */ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile, opts: Options) { + const { basePath, cache, sessionFactory, frontmatter, outputCache, minifyMaxCharacters } = opts; + const log = opts.log ?? console; // Pull out code-like nodes @@ -336,7 +358,7 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile } // We need the kernelspec to proceed - if (opts.frontmatter.kernelspec === undefined) { + if (frontmatter.kernelspec === undefined) { return fileError( vfile, `Notebook does not declare the necessary 'kernelspec' frontmatter key required for execution`, @@ -344,8 +366,8 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile } // See if we already cached this execution - const cacheKey = buildCacheKey(opts.frontmatter.kernelspec, executableNodes); - let cachedResults: (IExpressionResult | IOutput[])[] | undefined = opts.cache.get(cacheKey); + const cacheKey = buildCacheKey(frontmatter.kernelspec, executableNodes); + let cachedResults: (IExpressionResult | IOutput[])[] | undefined = cache.get(cacheKey); // Do we need to re-execute notebook? if (opts.ignoreCache || cachedResults === undefined) { @@ -354,7 +376,7 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile opts.ignoreCache ? '[cache ignored]' : '[no execution cache found]' }`, ); - const sessionManager = await opts.sessionFactory(); + const sessionManager = await sessionFactory(); // Do we not have a working session? if (sessionManager === undefined) { fileError(vfile, `Could not load Jupyter session manager to run executable nodes`, { @@ -365,11 +387,11 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile else { let sessionConnection: Session.ISessionConnection | undefined; const sessionOpts = { - path: path.relative(opts.basePath, vfile.path), + path: path.relative(basePath, vfile.path), type: 'notebook', name: path.basename(vfile.path), kernel: { - name: opts.frontmatter.kernelspec.name, + name: frontmatter.kernelspec.name, }, }; await sessionManager @@ -387,11 +409,11 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile const { results, errorOccurred } = await computeExecutableNodes( conn.kernel, executableNodes, - { vfile }, + vfile, ); // Populate cache if things were successful if (!errorOccurred) { - opts.cache.set(cacheKey, results); + cache.set(cacheKey, results); } // Refer to these computed results cachedResults = results; @@ -406,7 +428,12 @@ export async function kernelExecutionTransform(tree: GenericParent, vfile: VFile if (cachedResults) { // Apply results to tree - applyComputedOutputsToNodes(executableNodes, cachedResults); + await applyComputedOutputsToNodes( + executableNodes, + cachedResults, + outputCache, + minifyMaxCharacters, + ); } } diff --git a/packages/myst-spec-ext/package.json b/packages/myst-spec-ext/package.json index 25e793419..b8b2b08b8 100644 --- a/packages/myst-spec-ext/package.json +++ b/packages/myst-spec-ext/package.json @@ -32,6 +32,7 @@ "url": "https://github.com/jupyter-book/mystmd/issues" }, "dependencies": { - "myst-spec": "^0.0.5" + "myst-spec": "^0.0.5", + "nbtx": "^0.2.3" } } diff --git a/packages/myst-spec-ext/src/types.ts b/packages/myst-spec-ext/src/types.ts index 3730cab53..85020ff8a 100644 --- a/packages/myst-spec-ext/src/types.ts +++ b/packages/myst-spec-ext/src/types.ts @@ -20,6 +20,7 @@ import type { CrossReference as SpecCrossReference, Link as SpecLink, } from 'myst-spec'; +import type { MinifiedOutput } from 'nbtx'; type Visibility = 'show' | 'hide' | 'remove'; @@ -255,7 +256,7 @@ export type Output = Node & Target & { type: 'output'; children: (FlowContent | ListContent | PhrasingContent)[]; - jupyter_data: any; // TODO: set this to IOutput + jupyter_data: MinifiedOutput; }; export type Outputs = Node & From 89212a34ebdc706b7e6119ab34f6f74614e64c20 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 22 Jan 2025 15:07:15 +0000 Subject: [PATCH 7/8] wip: directly populate cache with outputs --- packages/myst-cli/src/process/mdast.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index 763e4eb50..34d6582cb 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -52,7 +52,6 @@ import { transformImageFormats, transformLinkedDOIs, transformLinkedRORs, - transformOutputsToCache, transformRenderInlineExpressions, transformThumbnail, StaticFileTransformer, @@ -244,7 +243,6 @@ export async function transformMdast( } // Combine file-specific citation renderers with project renderers from bib files const fileCitationRenderer = combineCitationRenderers(cache, ...rendererFiles); - await transformOutputsToCache(session, mdast, kind, { minifyMaxCharacters }); transformFilterOutputStreams(mdast, vfile, frontmatter.settings); transformCitations(session, file, mdast, fileCitationRenderer, references); await unified() From 0f6e96e04a653da52758ba3de406234bb3369a7b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 22 Jan 2025 15:35:41 +0000 Subject: [PATCH 8/8] fix: restore output lifting --- packages/myst-cli/src/process/mdast.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index 34d6582cb..d8eceb6a7 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -58,6 +58,7 @@ import { propagateBlockDataToCode, transformBanner, transformReduceOutputs, + transformLiftOutputs, transformPlaceholderImages, transformDeleteBase64UrlSource, transformWebp, @@ -388,6 +389,7 @@ export async function finalizeMdast( transformReduceOutputs(session, mdast, file, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder, }); + transformLiftOutputs(mdast); } transformOutputsToFile(session, mdast, imageWriteFolder, { altOutputFolder: simplifyFigures ? undefined : imageAltOutputFolder,