@@ -9,16 +9,127 @@ const ROOT_DIR = path.join(
99) ;
1010const OUTPUT = path . join ( ROOT_DIR , 'public/docs-index.json' ) ;
1111
// Words too common to be useful as search keywords. Built once at module load
// rather than on every call, since extractKeywords runs once per document.
const KEYWORD_STOP_WORDS = new Set([
  'the', 'is', 'at', 'which', 'on', 'and', 'a', 'to', 'are', 'as', 'was', 'were',
  'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'must', 'shall', 'of', 'in', 'for', 'with', 'by',
  'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above',
  'below', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
  'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
  'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
  'so', 'than', 'too', 'very', 'just', 'now', 'also', 'if', 'this',
  'that', 'these', 'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours',
  'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
  'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
  'they', 'them', 'their', 'theirs', 'themselves',
]);

/**
 * Simple frequency-based keyword extraction.
 *
 * The text is lower-cased, every character that is not a word character or
 * whitespace is replaced by a space, and the result is split on whitespace.
 * Tokens shorter than `minLength`, stop words and purely numeric tokens are
 * discarded. The remaining words are ranked by number of occurrences; ties
 * keep first-seen order.
 *
 * @param {string} text - Text to analyse. Non-string input yields `[]`.
 * @param {number} [minLength=3] - Minimum token length to keep.
 * @param {number} [maxCount=20] - Maximum number of keywords to return.
 * @returns {string[]} Up to `maxCount` keywords, most frequent first.
 */
function extractKeywords(text, minLength = 3, maxCount = 20) {
  if (typeof text !== 'string') {
    return [];
  }

  const words = text
    .toLowerCase()
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter(
      (word) =>
        word.length >= minLength &&
        !KEYWORD_STOP_WORDS.has(word) &&
        !/^\d+$/.test(word),
    );

  // A Map is used instead of a plain object so that words which collide with
  // Object.prototype members ("constructor", "__proto__", ...) are counted as
  // ordinary keys instead of picking up the inherited value.
  const wordCounts = new Map();
  for (const word of words) {
    wordCounts.set(word, (wordCounts.get(word) || 0) + 1);
  }

  // Sort by frequency (descending) and return the top N words.
  return [...wordCounts.entries()]
    .sort(([, a], [, b]) => b - a)
    .slice(0, maxCount)
    .map(([word]) => word);
}
52+
/**
 * Generates a short plain-text summary from markdown content.
 *
 * Fenced code blocks and inline code spans are dropped, links are reduced to
 * their link text, heading markers and `*` emphasis markers are removed, and
 * all whitespace is collapsed to single spaces. If the result is longer than
 * `maxLength` it is cut at `maxLength`; when a sentence terminator (`.`, `!`
 * or `?`) occurs in the last 30% of the cut text the summary ends there,
 * otherwise `...` is appended to the cut text.
 *
 * @param {string} content - Markdown source. Non-string input yields `''`.
 * @param {number} [maxLength=200] - Cut-off length before any `...` suffix.
 * @returns {string} The plain-text summary.
 */
function generateSummary(content, maxLength = 200) {
  if (typeof content !== 'string') {
    return '';
  }

  // Strip markdown syntax.
  const plainText = content
    .replace(/```[\s\S]*?```/g, '') // fenced code blocks
    .replace(/`[^`]+`/g, '') // inline code spans
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // [text](url) -> text
    // Heading markers only count at the start of a line; anchoring keeps a
    // '#' inside prose (e.g. "C# is ...") from being deleted.
    .replace(/^[ \t]{0,3}#{1,6}\s+/gm, '')
    .replace(/\*\*/g, '')
    .replace(/\*/g, '')
    .replace(/\n/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  if (plainText.length <= maxLength) {
    return plainText;
  }

  // Prefer to end on the last complete sentence inside the cut.
  const truncated = plainText.substring(0, maxLength);
  const lastSentenceEnd = Math.max(
    truncated.lastIndexOf('.'),
    truncated.lastIndexOf('!'),
    truncated.lastIndexOf('?'),
  );

  if (lastSentenceEnd > maxLength * 0.7) {
    return truncated.substring(0, lastSentenceEnd + 1);
  }

  // Drop a trailing space so the ellipsis attaches to the last word.
  return truncated.trimEnd() + '...';
}
85+
/**
 * Derives the document type and category from a file path and merges in
 * selected frontmatter fields.
 *
 * Paths starting with `blogs/` are blogs (category `Blog`). Paths starting
 * with `docs/` are documentation, categorised in this order: product name
 * taken from a `kubeblocks-for-<product>` entry under `docs/en/<version>/`,
 * then `user-guide`, `cli`, `release-notes`, and finally `general`. Any other
 * path yields empty strings for both fields.
 *
 * @param {string} filePath - Path relative to the repository root; both `/`
 *   and `\` separators are accepted.
 * @param {object} [frontmatter] - Parsed frontmatter; may be omitted.
 * @returns {{docType: string, category: string, tags: Array,
 *   sidebar_position: *, sidebar_label: *}} Metadata for the index entry.
 */
function extractMetadata(filePath, frontmatter) {
  // Separators are normalised so that backslash-separated paths (as produced
  // by path.relative on Windows) match the prefixes below.
  const normalizedPath = String(filePath).replace(/\\/g, '/');
  const segments = normalizedPath.split('/');
  const meta = frontmatter || {};
  let category = '';
  let docType = '';

  if (normalizedPath.startsWith('blogs/')) {
    docType = 'blog';
    category = 'Blog';
  } else if (normalizedPath.startsWith('docs/')) {
    docType = 'documentation';

    // Extract the product category from the path.
    const productMatch = normalizedPath.match(
      /docs\/en\/[^/]*\/kubeblocks-for-([^/]+)/,
    );
    if (productMatch) {
      // A product file directly under the version directory would otherwise
      // leak its extension into the category.
      category = productMatch[1].replace(/\.(md|mdx)$/, '');
    } else if (normalizedPath.includes('user_docs')) {
      category = 'user-guide';
    } else if (
      // Match "cli" as a whole path segment (or a cli.md/cli.mdx file) so that
      // names merely containing the letters, such as "client", do not qualify.
      segments.some((segment) => segment === 'cli' || /^cli\.mdx?$/.test(segment))
    ) {
      category = 'cli';
    } else if (normalizedPath.includes('release_notes')) {
      category = 'release-notes';
    } else {
      category = 'general';
    }
  }

  return {
    docType,
    category,
    tags: meta.tags || [],
    sidebar_position: meta.sidebar_position,
    sidebar_label: meta.sidebar_label,
  };
}
121+
12122async function main ( ) {
13- // 匹配 docs/en 和 blogs/en 下的 md/mdx 文件,排除 docs/en/**/cli/**
123+ // 只匹配 docs/en/preview 和 blogs/en 下的 md/mdx 文件,排除 CLI 和 release notes
14124 const files = await fg (
15125 [
16- 'docs/en/**/*.md' ,
17- 'docs/en/**/*.mdx' ,
126+ 'docs/en/preview/ **/*.md' ,
127+ 'docs/en/preview/ **/*.mdx' ,
18128 'blogs/en/**/*.md' ,
19129 'blogs/en/**/*.mdx' ,
20- '!docs/en/**/cli/**' ,
21- '!docs/en/**/release_notes' ,
130+ '!docs/en/preview/**/cli/**' ,
131+ '!docs/en/preview/**/release_notes/**' ,
132+ '!docs/en/release-*/**' , // 明确排除所有release版本目录
22133 ] ,
23134 { cwd : ROOT_DIR , absolute : true } ,
24135 ) ;
@@ -28,13 +139,14 @@ async function main() {
28139 const relPath = path . relative ( ROOT_DIR , file ) ;
29140 const raw = fs . readFileSync ( file , 'utf-8' ) ;
30141 const { data, content } = matter ( raw ) ;
142+
31143 // 替换 path 中的 blogs/en 和 docs/en,并去除 .md/.mdx 后缀
32144 let normPath = relPath
33- . replace ( / ^ b l o g s \/ e n \/ / , 'blogs /' )
145+ . replace ( / ^ b l o g s \/ e n \/ / , 'blog /' )
34146 . replace ( / ^ d o c s \/ e n \/ / , 'docs/' ) ;
35147 normPath = normPath . replace ( / \. ( m d | m d x ) $ / , '' ) ;
36148
37- // 过滤掉 mdx 的 import/export 语句和宏/JSX函数(如 import ...、export ...、<XXX ... />、{ ... })
149+ // 过滤掉 mdx 的 import/export 语句和宏/JSX函数
38150 const filteredContent = content
39151 . split ( '\n' )
40152 . filter (
@@ -48,13 +160,38 @@ async function main() {
48160 . replace ( / < [ ^ > ] + > / g, '' ) // 去除内联 JSX 标签
49161 . replace ( / \{ [ ^ } ] + \} / g, '' ) ; // 去除内联 JS 表达式
50162
51- const description = filteredContent . replace ( / \n / g, ' ' ) . slice ( 0 , 300 ) ;
163+ // 提取更多内容,不限制在2000字符
164+ const fullContent = filteredContent ;
165+
166+ // 生成摘要和关键词
167+ const summary = generateSummary ( filteredContent ) ;
168+ const keywords = extractKeywords ( filteredContent ) ;
169+ const metadata = extractMetadata ( relPath , data ) ;
170+
171+ // 提取标题层级结构
172+ const headings = [ ] ;
173+ const headingMatches = content . match ( / ^ # { 1 , 6 } \s + .+ $ / gm) ;
174+ if ( headingMatches ) {
175+ headingMatches . forEach ( heading => {
176+ const level = ( heading . match ( / ^ # + / ) || [ '' ] ) [ 0 ] . length ;
177+ const text = heading . replace ( / ^ # + \s + / , '' ) . trim ( ) ;
178+ headings . push ( { level, text } ) ;
179+ } ) ;
180+ }
181+
52182 return {
53183 id : relPath . replace ( / \/ / g, '_' ) . replace ( / \. ( m d | m d x ) $ / , '' ) ,
54- title : data . title || path . basename ( file ) ,
55- content : filteredContent . slice ( 0 , 2000 ) ,
184+ title : data . title || data . sidebar_label || path . basename ( file , path . extname ( file ) ) ,
185+ content : fullContent ,
56186 path : normPath ,
57- description,
187+ description : data . description || summary ,
188+ summary,
189+ keywords,
190+ headings,
191+ ...metadata ,
192+ // 保留原有字段以兼容现有搜索
193+ lastModified : fs . statSync ( file ) . mtime . toISOString ( ) ,
194+ wordCount : fullContent . split ( / \s + / ) . length ,
58195 } ;
59196 } ) ;
60197
0 commit comments