storybookjs · valentinpalkovic · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/eval/README.md b/eval/README.md
@@ -124,8 +124,9 @@ Each experiment produces:
 - **Build success**: Can the project build without errors?
 - **Type check**: TypeScript compilation errors count
 - **Lint**: ESLint errors count
-- **Tests**: Storybook test results (passed/failed)
+- **Tests**: Storybook story results (passed/failed) including play functions
 - **Accessibility**: Axe violations count
+- **Coverage**: Vite/Vitest coverage summary (percentage of lines/statements/branches/functions covered)
 - **Cost**: API usage cost in USD
 - **Duration**: Total time and API time in seconds
 - **Turns**: Number of agent conversation turns
@@ -145,7 +146,13 @@ Complete metrics from execution and evaluation:
 	"typeCheckErrors": 0,
 	"lintErrors": 0,
 	"test": { "passed": 3, "failed": 0 },
-	"a11y": { "violations": 1 }
+	"a11y": { "violations": 1 },
+	"coverage": {
+		"lines": 87.5,
+		"statements": 86.9,
+		"branches": 75.0,
+		"functions": 80.0
+	}
 }
 ```
 

diff --git a/eval/eval.ts b/eval/eval.ts
@@ -99,7 +99,7 @@
 try {
 	await teardownExperiment(experimentArgs);
 } catch (error) {
 	p.log.error(`Failed to teardown experiment: ${error}`);
 	// Continue with evaluation despite teardown failure
 }

@@ -145,6 +145,15 @@
 	);
 }
 
+// Print the coverage percentages from the evaluation summary. A metric that is
+// not a number is rendered as an en dash; when the summary has no coverage
+// object at all, a placeholder line is printed instead.
+const cov = evaluationSummary.coverage;
+const formatCov = (v: number | null | undefined) => {
+	if (typeof v !== 'number') {
+		return '–';
+	}
+	return `${v}%`;
+};
+if (cov) {
+	const { lines, statements, branches, functions } = cov;
+	p.log.message(
+		`📊 Coverage: lines ${formatCov(lines)}, statements ${formatCov(statements)}, branches ${formatCov(branches)}, functions ${formatCov(functions)}`,
+	);
+} else {
+	p.log.message('📊 Coverage: (not collected)');
+}
+
 p.log.message(
 	`⏱️  Duration: ${promptSummary.duration}s (API: ${promptSummary.durationApi}s)`,
 );

diff --git a/eval/lib/context-utils.ts b/eval/lib/context-utils.ts
@@ -0,0 +1,5 @@
+import type { Context } from '../types.ts';
+
+/**
+ * Tells whether the given context is the Storybook MCP dev context.
+ *
+ * @param context - Evaluation context to inspect.
+ * @returns `true` only when `context.type` is `'storybook-mcp-dev'`.
+ */
+export function isDevEvaluation(context: Context): boolean {
+	const { type } = context;
+	return type === 'storybook-mcp-dev';
+}
diff --git a/eval/lib/evaluations/coverage.ts b/eval/lib/evaluations/coverage.ts
@@ -0,0 +1,117 @@
+import * as path from 'node:path';
+import * as fs from 'node:fs/promises';
+import type { EvaluationSummary } from '../../types';
+import type { CoverageFiles, CoverageSummary } from './result-types';
+import { createCoverageMap } from 'istanbul-lib-coverage';
+import { log } from '@clack/prompts';
+
+/**
+ * Computes coverage metrics for a project from its istanbul report.
+ *
+ * Reads `<projectPath>/coverage/coverage-final.json`, derives the total and
+ * per-file percentages with istanbul-lib-coverage, and writes
+ * `coverage-summary.json` plus a normalized `coverage-final.json` into
+ * `<resultsPath>/coverage`.
+ *
+ * @param projectPath - Directory of the project whose coverage report is read.
+ * @param resultsPath - Directory that receives the normalized coverage files.
+ * @returns The total percentages and the per-file details. Both are
+ * `undefined` when the report cannot be read, processed or written; in that
+ * case a warning is logged and nothing is thrown.
+ */
+export async function computeCoverage(
+	projectPath: string,
+	resultsPath: string,
+): Promise<{
+	coverage?: EvaluationSummary['coverage'];
+	coverageFiles?: CoverageFiles;
+}> {
+	const finalCoveragePath = path.join(
+		projectPath,
+		'coverage',
+		'coverage-final.json',
+	);
+
+	try {
+		// Parse the report straight from disk. A dynamic JSON `import()` would be
+		// cached for the lifetime of the process and needs the
+		// `with: { type: 'json' }` attribute form to be valid.
+		const coverageData = JSON.parse(
+			await fs.readFile(finalCoveragePath, 'utf8'),
+		);
+
+		// Derive totals and per-file data from coverage-final using istanbul-lib-coverage
+		const coverageMap = createCoverageMap(coverageData);
+		const summary = coverageMap.getCoverageSummary().toJSON();
+		const coverageFiles: CoverageFiles = {};
+
+		for (const filePath of Object.keys(coverageMap.toJSON())) {
+			if (filePath === 'total') continue;
+			const fileCoverage = coverageMap.fileCoverageFor(filePath);
+			const fileSummary = fileCoverage.toSummary().toJSON();
+
+			// The source text is optional: an unreadable file is recorded without it.
+			let source: string | undefined;
+			try {
+				source = await fs.readFile(filePath, 'utf8');
+			} catch {
+				source = undefined;
+			}
+
+			let lineHits: Record<string, number> | undefined;
+			let branchesByLine:
+				| Record<string, { covered: number | null; total: number | null }>
+				| undefined;
+			try {
+				lineHits = fileCoverage.getLineCoverage() as Record<string, number>;
+				const branches = fileCoverage.getBranchCoverageByLine?.();
+				if (branches && typeof branches === 'object') {
+					branchesByLine = {};
+					for (const [line, data] of Object.entries(
+						branches as Record<
+							string,
+							{ covered?: number | null; total?: number | null }
+						>,
+					)) {
+						branchesByLine[line] = {
+							covered: data.covered ?? null,
+							total: data.total ?? null,
+						};
+					}
+				}
+			} catch {
+				// Line/branch details are best-effort; keep the file's percentages.
+				log.warning(`Failed to get branch coverage for file ${filePath}`);
+			}
+
+			coverageFiles[filePath] = {
+				branches: { pct: fileSummary.branches.pct },
+				functions: { pct: fileSummary.functions.pct },
+				lines: { pct: fileSummary.lines.pct },
+				statements: { pct: fileSummary.statements.pct },
+				lineHits,
+				branchesByLine,
+				source,
+			};
+		}
+
+		const normalizedTotal: CoverageSummary = {
+			branches: { pct: summary.branches.pct },
+			functions: { pct: summary.functions.pct },
+			lines: { pct: summary.lines.pct },
+			statements: { pct: summary.statements.pct },
+		};
+
+		const coverage: EvaluationSummary['coverage'] = {
+			branches: normalizedTotal.branches?.pct ?? null,
+			functions: normalizedTotal.functions?.pct ?? null,
+			lines: normalizedTotal.lines?.pct ?? null,
+			statements: normalizedTotal.statements?.pct ?? null,
+		};
+
+		const targetCoverageDir = path.join(resultsPath, 'coverage');
+		await fs.mkdir(targetCoverageDir, { recursive: true });
+		await fs.writeFile(
+			path.join(targetCoverageDir, 'coverage-summary.json'),
+			JSON.stringify({ total: normalizedTotal }, null, 2),
+		);
+		await fs.writeFile(
+			path.join(targetCoverageDir, 'coverage-final.json'),
+			JSON.stringify(coverageFiles, null, 2),
+		);
+
+		return { coverage, coverageFiles };
+	} catch (error) {
+		// Coverage is optional for an evaluation: report the cause and carry on.
+		log.warning(
+			`Failed to compute coverage for project ${projectPath}: ${error}`,
+		);
+		return { coverage: undefined, coverageFiles: undefined };
+	}
+}
diff --git a/eval/lib/evaluations/parse-tests.ts b/eval/lib/evaluations/parse-tests.ts
@@ -0,0 +1,65 @@
+import * as path from 'node:path';
+import * as fs from 'node:fs/promises';
+import type { JsonAssertionResult, JsonTestResults } from 'vitest/reporters';
+import type { A11yViolations, StoryResult, TestSummary } from './result-types';
+
+/**
+ * Parses the JSON test report at `<resultsPath>/tests.json` into per-story
+ * results.
+ *
+ * The report file is rewritten pretty-printed as a side effect. Assertions
+ * without a `meta.storyId` are ignored. When several assertions belong to the
+ * same story, the worst status wins (failed > todo > skipped > passed), so a
+ * later passing assertion can no longer hide an earlier failure.
+ *
+ * @param resultsPath - Directory that contains `tests.json`.
+ * @returns The pass/fail counts, the a11y violations keyed by story id, and
+ * one result per story. Every story whose status is not `passed` is counted
+ * as failed.
+ */
+export async function parseTestResults(resultsPath: string): Promise<{
+	testSummary: TestSummary;
+	a11y: A11yViolations;
+	storyResults: StoryResult[];
+}> {
+	const testResultsPath = path.join(resultsPath, 'tests.json');
+	const { default: jsonTestResults } = (await import(testResultsPath, {
+		with: { type: 'json' },
+	})) as { default: JsonTestResults };
+
+	// write the file again to pretty-print it
+	await fs.writeFile(testResultsPath, JSON.stringify(jsonTestResults, null, 2));
+
+	// Higher number = worse outcome. Statuses that are not listed rank below
+	// `passed`.
+	const statusPriority: Partial<Record<string, number>> = {
+		failed: 3,
+		todo: 2,
+		skipped: 1,
+		passed: 0,
+	};
+	const priorityOf = (status: string): number => statusPriority[status] ?? -1;
+
+	const a11yViolations: A11yViolations = {};
+	const storyStatuses = new Map<string, JsonAssertionResult['status']>();
+
+	const testSuites = jsonTestResults.testResults
+		? Object.values(jsonTestResults.testResults)
+		: [];
+
+	for (const jsonTestResult of testSuites) {
+		for (const assertionResult of jsonTestResult.assertionResults ?? []) {
+			// eslint-disable-next-line @typescript-eslint/no-explicit-any -- `meta` carries untyped story data
+			const meta = assertionResult.meta as any;
+			const storyId: string | undefined = meta?.storyId;
+			if (!storyId) continue;
+
+			// Keep the worst status seen so far for this story.
+			const prevStatus = storyStatuses.get(storyId);
+			const keepPrevious =
+				prevStatus !== undefined &&
+				priorityOf(prevStatus) > priorityOf(assertionResult.status);
+			storyStatuses.set(
+				storyId,
+				keepPrevious ? prevStatus : assertionResult.status,
+			);
+
+			for (const report of meta.reports ?? []) {
+				if (report.type === 'a11y' && report.result?.violations?.length > 0) {
+					a11yViolations[storyId] = report.result.violations;
+				}
+			}
+		}
+	}
+
+	// NOTE(review): statuses other than passed/failed are passed through
+	// unchanged even though StoryResult only declares those two — confirm
+	// whether consumers expect them to be normalized.
+	const storyResults = Array.from(
+		storyStatuses,
+		([storyId, status]) =>
+			({
+				storyId,
+				status,
+			}) as StoryResult,
+	);
+
+	const testsPassed = storyResults.filter((s) => s.status === 'passed').length;
+	const testsFailed = storyResults.length - testsPassed;
+
+	return {
+		testSummary: {
+			passed: testsPassed,
+			failed: testsFailed,
+		},
+		a11y: a11yViolations,
+		storyResults,
+	};
+}
diff --git a/eval/lib/evaluations/result-types.ts b/eval/lib/evaluations/result-types.ts
@@ -0,0 +1,29 @@
+import type { EvaluationSummary } from '../../types';
+
+/** Pass/fail counts picked from the `test` section of the evaluation summary. */
+export type TestSummary = Pick<EvaluationSummary['test'], 'passed' | 'failed'>;
+
+/** Outcome recorded for a single story. */
+export type StoryResult = {
+	storyId: string;
+	status: 'passed' | 'failed';
+};
+
+/** Lists of violations keyed by string (untyped entries). */
+export type A11yViolations = Record<string, any[]>;
+
+/** A single coverage percentage; `null` when no value is available. */
+type PctMetric = { pct: number | null };
+
+/** Coverage percentages for the four istanbul metrics. */
+export type CoverageSummary = Record<
+	'branches' | 'functions' | 'lines' | 'statements',
+	PctMetric
+>;
+
+/** Covered/total branch counts recorded for one source line. */
+type BranchHits = { covered: number | null; total: number | null };
+
+/** Optional per-file details stored next to the percentages. */
+type FileCoverageDetails = {
+	lineHits?: Record<string, number>;
+	branchesByLine?: Record<string, BranchHits>;
+	source?: string;
+};
+
+/** Per-file coverage: percentages plus optional line hits, branch counts and source text. */
+export type CoverageFiles = Record<
+	string,
+	FileCoverageDetails & CoverageSummary
+>;
diff --git a/eval/lib/evaluations/run-tests.ts b/eval/lib/evaluations/run-tests.ts
@@ -0,0 +1,37 @@
+import * as path from 'node:path';
+import * as fs from 'node:fs/promises';
+import { x } from 'tinyexec';
+import { dedent } from 'ts-dedent';
+import type { ExperimentArgs } from '../../types';
+
+/**
+ * Runs a package script with pnpm in the experiment's project and records the
+ * output.
+ *
+ * The exit code, stdout and stderr of the run are written as a Markdown report
+ * to `<resultsPath>/tests.md`.
+ *
+ * @param experimentArgs - Supplies `projectPath` (working directory of the run)
+ * and `resultsPath` (where the report is written).
+ * @param testScript - Name of the script passed to `pnpm`.
+ * @returns The exit code of the run. When the process ended without an exit
+ * code (for example because it was terminated by a signal), `1` is returned so
+ * that an abnormal termination is never reported as success.
+ */
+export async function runTests(
+	experimentArgs: ExperimentArgs,
+	testScript: string,
+): Promise<number> {
+	const { projectPath, resultsPath } = experimentArgs;
+	const result = await x('pnpm', [testScript], {
+		nodeOptions: { cwd: projectPath },
+	});
+
+	await fs.writeFile(
+		path.join(resultsPath, 'tests.md'),
+		dedent`# Test Results
+
+	**Exit Code:** ${result.exitCode}
+
+	## stdout
+
+	\`\`\`sh
+	${result.stdout}
+	\`\`\`
+
+	## stderr
+
+	\`\`\`
+	${result.stderr}
+	\`\`\`
+	`,
+	);
+
+	// A missing exit code means the process did not exit normally; treat it as
+	// a failure instead of defaulting to 0 (success).
+	return result.exitCode ?? 1;
+}