Skip to content

Commit 1f096d6

Browse files
authored
Merge pull request #120 from apecloud/support/update-search-bar
chore: update search bar indexing
2 parents 90e6837 + b949209 commit 1f096d6

File tree

9 files changed

+830
-203
lines changed

9 files changed

+830
-203
lines changed
68.4 KB
Loading

scripts/generate-docs-index.mjs

Lines changed: 148 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,127 @@ const ROOT_DIR = path.join(
99
);
1010
const OUTPUT = path.join(ROOT_DIR, 'public/docs-index.json');
1111

12+
/**
 * Extracts the most frequent meaningful words from a piece of text.
 *
 * The text is lower-cased, every character that is neither a word character
 * nor whitespace is replaced by a space, and the result is split on
 * whitespace. Tokens shorter than `minLength`, English stop words and purely
 * numeric tokens are discarded. The remaining words are ranked by how often
 * they occur; ties keep first-occurrence order.
 *
 * @param text      Text to analyse. Empty or non-string input yields `[]`.
 * @param minLength Minimum token length to keep (default 3).
 * @param maxCount  Maximum number of keywords to return (default 20).
 * @returns Up to `maxCount` keywords, most frequent first.
 */
function extractKeywords(text, minLength = 3, maxCount = 20) {
  if (typeof text !== 'string' || text.length === 0) {
    return [];
  }

  // Common stop words
  const stopWords = new Set([
    'the', 'is', 'at', 'which', 'on', 'and', 'a', 'to', 'are', 'as', 'was', 'were',
    'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'can', 'must', 'shall', 'of', 'in', 'for', 'with', 'by',
    'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above',
    'below', 'to', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
    'so', 'than', 'too', 'very', 'can', 'just', 'now', 'also', 'if', 'this',
    'that', 'these', 'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours',
    'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
    'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves'
  ]);

  const words = text
    .toLowerCase()
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter(word =>
      word.length >= minLength &&
      !stopWords.has(word) &&
      !/^\d+$/.test(word)
    );

  // Count occurrences in a Map rather than a plain object: with `{}` a token
  // such as "constructor" would read the inherited Object.prototype member as
  // its initial count, turning the count into a string and breaking the sort.
  const wordCounts = new Map();
  for (const word of words) {
    wordCounts.set(word, (wordCounts.get(word) || 0) + 1);
  }

  // Sort by frequency (stable, so ties keep insertion order) and return the
  // top N words.
  return [...wordCounts.entries()]
    .sort(([, a], [, b]) => b - a)
    .slice(0, maxCount)
    .map(([word]) => word);
}
52+
53+
/**
 * Builds a short plain-text summary from markdown content.
 *
 * Fenced code blocks and inline code spans are removed, links are reduced to
 * their label, heading markers and `*` emphasis markers are stripped, and all
 * whitespace is collapsed to single spaces. If the result is longer than
 * `maxLength` it is cut at `maxLength` characters; when a sentence terminator
 * (`.`, `!`, `?`) lies in the last 30% of that window the summary ends there,
 * otherwise `...` is appended to the cut text.
 *
 * @param content   Markdown source. Empty or non-string input yields `''`.
 * @param maxLength Maximum number of characters taken from the plain text
 *                  (default 200); an appended `...` is not counted.
 * @returns The summary string.
 */
function generateSummary(content, maxLength = 200) {
  if (typeof content !== 'string' || content.length === 0) {
    return '';
  }

  // Strip markdown syntax.
  const plainText = content
    .replace(/```[\s\S]*?```/g, '')
    .replace(/`[^`]+`/g, '')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Only strip `#` runs that open a line (ATX headings). An unanchored
    // pattern would also eat a `#` inside prose, e.g. "C# with" -> "Cwith".
    .replace(/^[ \t]{0,3}#{1,6}\s+/gm, '')
    .replace(/\*\*/g, '')
    .replace(/\*/g, '')
    .replace(/\n/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  if (plainText.length <= maxLength) {
    return plainText;
  }

  // Find the end of the last complete sentence inside the window.
  const truncated = plainText.substring(0, maxLength);
  const lastSentenceEnd = Math.max(
    truncated.lastIndexOf('.'),
    truncated.lastIndexOf('!'),
    truncated.lastIndexOf('?')
  );

  // Only cut at the sentence boundary if that keeps most of the window.
  if (lastSentenceEnd > maxLength * 0.7) {
    return truncated.substring(0, lastSentenceEnd + 1);
  }

  return truncated + '...';
}
85+
86+
/**
 * Derives the document type, category and selected frontmatter fields for a
 * content file.
 *
 * - Paths starting with `blogs/` are typed `blog` with category `Blog`.
 * - Paths starting with `docs/` are typed `documentation`; the category is
 *   the product name captured from a `kubeblocks-for-<product>` directory
 *   under `docs/en/<version>/`, otherwise `user-guide`, `cli`,
 *   `release-notes` or `general` depending on the path.
 * - Any other path yields empty `docType` and `category`.
 *
 * @param filePath    Path of the file relative to the repository root.
 *                    Backslash separators are treated as `/`.
 * @param frontmatter Parsed frontmatter object; may be omitted.
 * @returns `{ docType, category, tags, sidebar_position, sidebar_label }`.
 */
function extractMetadata(filePath, frontmatter) {
  const fm = frontmatter || {};
  // Normalise Windows separators so the prefix and regex checks below work
  // regardless of the platform that produced the relative path.
  const normalizedPath = String(filePath).replace(/\\/g, '/');
  let category = '';
  let docType = '';

  if (normalizedPath.startsWith('blogs/')) {
    docType = 'blog';
    category = 'Blog';
  } else if (normalizedPath.startsWith('docs/')) {
    docType = 'documentation';

    // Extract the product category from the path.
    const productMatch = normalizedPath.match(/docs\/en\/[^/]*\/kubeblocks-for-([^/]+)/);
    if (productMatch) {
      category = productMatch[1];
    } else if (normalizedPath.includes('user_docs')) {
      category = 'user-guide';
    } else if (/(?:^|\/)cli(?:[/.]|$)/.test(normalizedPath)) {
      // Match `cli` only as a whole path segment (or a `cli.*` file name);
      // a substring test would also classify e.g. `client-setup.md` as CLI.
      category = 'cli';
    } else if (normalizedPath.includes('release_notes')) {
      category = 'release-notes';
    } else {
      category = 'general';
    }
  }

  // Frontmatter may declare a single tag as a scalar; always return an array.
  let tags = [];
  if (Array.isArray(fm.tags)) {
    tags = fm.tags;
  } else if (fm.tags) {
    tags = [fm.tags];
  }

  return {
    docType,
    category,
    tags,
    sidebar_position: fm.sidebar_position,
    sidebar_label: fm.sidebar_label,
  };
}
121+
12122
async function main() {
13-
// 匹配 docs/en 和 blogs/en 下的 md/mdx 文件,排除 docs/en/**/cli/**
123+
// 只匹配 docs/en/preview 和 blogs/en 下的 md/mdx 文件,排除 CLI 和 release notes
14124
const files = await fg(
15125
[
16-
'docs/en/**/*.md',
17-
'docs/en/**/*.mdx',
126+
'docs/en/preview/**/*.md',
127+
'docs/en/preview/**/*.mdx',
18128
'blogs/en/**/*.md',
19129
'blogs/en/**/*.mdx',
20-
'!docs/en/**/cli/**',
21-
'!docs/en/**/release_notes',
130+
'!docs/en/preview/**/cli/**',
131+
'!docs/en/preview/**/release_notes/**',
132+
'!docs/en/release-*/**', // 明确排除所有release版本目录
22133
],
23134
{ cwd: ROOT_DIR, absolute: true },
24135
);
@@ -28,13 +139,14 @@ async function main() {
28139
const relPath = path.relative(ROOT_DIR, file);
29140
const raw = fs.readFileSync(file, 'utf-8');
30141
const { data, content } = matter(raw);
142+
31143
// 替换 path 中的 blogs/en 和 docs/en,并去除 .md/.mdx 后缀
32144
let normPath = relPath
33-
.replace(/^blogs\/en\//, 'blogs/')
145+
.replace(/^blogs\/en\//, 'blog/')
34146
.replace(/^docs\/en\//, 'docs/');
35147
normPath = normPath.replace(/\.(md|mdx)$/, '');
36148

37-
// 过滤掉 mdx 的 import/export 语句和宏/JSX函数(如 import ...、export ...、<XXX ... />、{ ... })
149+
// 过滤掉 mdx 的 import/export 语句和宏/JSX函数
38150
const filteredContent = content
39151
.split('\n')
40152
.filter(
@@ -48,13 +160,38 @@ async function main() {
48160
.replace(/<[^>]+>/g, '') // 去除内联 JSX 标签
49161
.replace(/\{[^}]+\}/g, ''); // 去除内联 JS 表达式
50162

51-
const description = filteredContent.replace(/\n/g, ' ').slice(0, 300);
163+
// 提取更多内容,不限制在2000字符
164+
const fullContent = filteredContent;
165+
166+
// 生成摘要和关键词
167+
const summary = generateSummary(filteredContent);
168+
const keywords = extractKeywords(filteredContent);
169+
const metadata = extractMetadata(relPath, data);
170+
171+
// 提取标题层级结构
172+
const headings = [];
173+
const headingMatches = content.match(/^#{1,6}\s+.+$/gm);
174+
if (headingMatches) {
175+
headingMatches.forEach(heading => {
176+
const level = (heading.match(/^#+/) || [''])[0].length;
177+
const text = heading.replace(/^#+\s+/, '').trim();
178+
headings.push({ level, text });
179+
});
180+
}
181+
52182
return {
53183
id: relPath.replace(/\//g, '_').replace(/\.(md|mdx)$/, ''),
54-
title: data.title || path.basename(file),
55-
content: filteredContent.slice(0, 2000),
184+
title: data.title || data.sidebar_label || path.basename(file, path.extname(file)),
185+
content: fullContent,
56186
path: normPath,
57-
description,
187+
description: data.description || summary,
188+
summary,
189+
keywords,
190+
headings,
191+
...metadata,
192+
// 保留原有字段以兼容现有搜索
193+
lastModified: fs.statSync(file).mtime.toISOString(),
194+
wordCount: fullContent.split(/\s+/).length,
58195
};
59196
});
60197

src/app/[locale]/ElevationScrollAppBar.tsx

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import SearchModal from '@/components/SearchModal';
44
import { SlackIconNoColor } from '@/components/icons';
55
import { useI18n } from '@/locales/client';
66
import { useGlobalStore } from '@/store/global';
7+
import { searchBarStyles } from '@/styles/searchBar.styles';
78
import {
89
GitHub,
910
LaunchOutlined,
@@ -16,6 +17,7 @@ import {
1617
Button,
1718
IconButton,
1819
Stack,
20+
TextField,
1921
Toolbar,
2022
useMediaQuery,
2123
useTheme,
@@ -135,10 +137,26 @@ export const ElevationScrollAppBar = (props: AppBarProps) => {
135137
>
136138
KubeBlocks Cloud
137139
</Button>
138-
<Box sx={{ minWidth: 36, maxWidth: 40, flex: 0 }}>
139-
<IconButton onClick={() => setShowSearch(true)}>
140-
<SearchIcon />
141-
</IconButton>
140+
<Box sx={searchBarStyles.container}>
141+
<TextField
142+
size="small"
143+
placeholder={mobile ? "Search..." : "Search docs..."}
144+
variant="outlined"
145+
onClick={() => setShowSearch(true)}
146+
InputProps={{
147+
startAdornment: <SearchIcon sx={searchBarStyles.searchIcon} />,
148+
endAdornment: !mobile && (
149+
<Box sx={searchBarStyles.shortcutContainer}>
150+
<Box component="kbd" sx={searchBarStyles.shortcutKey}>
151+
⌘K
152+
</Box>
153+
</Box>
154+
),
155+
readOnly: true,
156+
sx: searchBarStyles.inputRoot,
157+
}}
158+
sx={searchBarStyles.textField}
159+
/>
142160
</Box>
143161
</Stack>
144162
<Box

src/app/api/search-index/route.ts

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { readFileSync, readdirSync } from 'fs';
1+
import { readFileSync, readdirSync, existsSync } from 'fs';
22
import matter from 'gray-matter';
33
import { NextResponse } from 'next/server';
44
import { join } from 'path';
@@ -8,6 +8,15 @@ interface SearchDocument {
88
title: string;
99
content: string;
1010
path: string;
11+
description?: string;
12+
summary?: string;
13+
keywords?: string[];
14+
headings?: Array<{ level: number; text: string }>;
15+
docType?: string;
16+
category?: string;
17+
tags?: string[];
18+
lastModified?: string;
19+
wordCount?: number;
1120
}
1221

1322
// 递归获取所有 MDX 文件
@@ -32,8 +41,22 @@ function extractContent(filePath: string): SearchDocument {
3241
const content = readFileSync(filePath, 'utf-8');
3342
const { data, content: markdownContent } = matter(content);
3443

44+
// 过滤掉 mdx 的 import/export 语句和宏/JSX函数
45+
const filteredContent = markdownContent
46+
.split('\n')
47+
.filter(
48+
(line) =>
49+
!line.trim().startsWith('import ') &&
50+
!line.trim().startsWith('export ') &&
51+
!/^<[A-Z][A-Za-z0-9]*[\s/>]/.test(line.trim()) &&
52+
!/^\{.*\}$/.test(line.trim()),
53+
)
54+
.join('\n')
55+
.replace(/<[^>]+>/g, '') // 去除内联 JSX 标签
56+
.replace(/\{[^}]+\}/g, ''); // 去除内联 JS 表达式
57+
3558
// 移除 markdown 语法,只保留纯文本
36-
const plainContent = markdownContent
59+
const plainContent = filteredContent
3760
.replace(/```[\s\S]*?```/g, '') // 移除代码块
3861
.replace(/`.*?`/g, '') // 移除行内代码
3962
.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // 将链接转换为纯文本
@@ -44,27 +67,93 @@ function extractContent(filePath: string): SearchDocument {
4467
.replace(/\s+/g, ' ') // 将多个空格转换为单个空格
4568
.trim();
4669

70+
// 生成摘要
71+
const summary = plainContent.length > 200 ?
72+
plainContent.substring(0, 200) + '...' :
73+
plainContent;
74+
75+
// 提取关键词
76+
const keywords = plainContent
77+
.toLowerCase()
78+
.split(/\s+/)
79+
.filter(word => word.length > 3)
80+
.reduce((acc: string[], word) => {
81+
if (!acc.includes(word)) acc.push(word);
82+
return acc;
83+
}, [])
84+
.slice(0, 20);
85+
86+
// 提取标题层级
87+
const headings: Array<{ level: number; text: string }> = [];
88+
const headingMatches = markdownContent.match(/^#{1,6}\s+.+$/gm);
89+
if (headingMatches) {
90+
headingMatches.forEach(heading => {
91+
const level = (heading.match(/^#+/) || [''])[0].length;
92+
const text = heading.replace(/^#+\s+/, '').trim();
93+
headings.push({ level, text });
94+
});
95+
}
96+
97+
// 确定文档类型和类别
98+
let docType = 'documentation';
99+
let category = 'general';
100+
101+
if (filePath.includes('/blogs/')) {
102+
docType = 'blog';
103+
category = 'Blog';
104+
} else if (filePath.includes('kubeblocks-for-')) {
105+
const match = filePath.match(/kubeblocks-for-([^/]+)/);
106+
category = match ? match[1] : 'general';
107+
} else if (filePath.includes('cli')) {
108+
category = 'CLI';
109+
}
110+
47111
return {
48112
id: filePath,
49113
title:
50114
data.title ||
115+
data.sidebar_label ||
51116
filePath
52117
.split('/')
53118
.pop()
54119
?.replace(/\.mdx?$/, '') ||
55120
'',
56121
content: plainContent,
57122
path: filePath.replace(/\.mdx?$/, ''),
123+
description: data.description || summary,
124+
summary,
125+
keywords,
126+
headings,
127+
docType,
128+
category,
129+
tags: data.tags || [],
130+
wordCount: plainContent.split(/\s+/).length,
58131
};
59132
}
60133

61134
export async function GET() {
62135
try {
63-
const docsDir = join(process.cwd(), 'docs');
64-
const mdxFiles = getAllMdxFiles(docsDir);
136+
const rootDir = process.cwd();
137+
138+
// 只获取preview目录下的文档和博客文件
139+
const allFiles: string[] = [];
140+
141+
// 获取docs/en/preview下的文件
142+
const previewDir = join(rootDir, 'docs', 'en', 'preview');
143+
if (existsSync(previewDir)) {
144+
const previewFiles = getAllMdxFiles(previewDir, 'docs/en/preview');
145+
allFiles.push(...previewFiles.map(file => join(rootDir, file)));
146+
}
147+
148+
// 获取blogs/en下的文件
149+
const blogsEnDir = join(rootDir, 'blogs', 'en');
150+
if (existsSync(blogsEnDir)) {
151+
const blogFiles = getAllMdxFiles(blogsEnDir, 'blogs/en');
152+
allFiles.push(...blogFiles.map(file => join(rootDir, file)));
153+
}
65154

66-
const documents = mdxFiles.map((filePath) =>
67-
extractContent(join(docsDir, filePath)),
155+
const documents = allFiles.map((filePath) =>
156+
extractContent(filePath),
68157
);
69158

70159
return NextResponse.json(documents);

0 commit comments

Comments
 (0)