-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparser.go
306 lines (288 loc) · 8.85 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
// Package raggo provides a flexible and extensible document parsing system
// for RAG (Retrieval-Augmented Generation) applications. The system supports
// multiple file formats and can be extended with custom parsers.
package raggo
import (
"context"
"github.com/teilomillet/raggo/rag"
)
// Document represents a parsed document with its content and metadata.
// The structure includes:
// - Content: The extracted text from the document
// - Metadata: Additional information about the document
//
// Example:
//
// doc := Document{
// Content: "Extracted text content...",
// Metadata: map[string]string{
// "file_type": "pdf",
// "file_path": "/path/to/doc.pdf",
// },
// }
type Document = rag.Document
// Parser defines the interface for document parsing implementations.
// Any type implementing this interface can be registered to handle
// specific file types. The interface is designed to be simple yet
// powerful enough to support various parsing strategies.
//
// Implementations must handle:
// - File access and reading
// - Content extraction
// - Metadata collection
// - Error handling
//
// Example:
//
// // Create a new parser with default settings
// parser := NewParser()
//
// // Parse a document
// doc, err := parser.Parse("document.pdf")
// if err != nil {
// log.Fatal(err)
// }
// fmt.Printf("Content: %s\n", doc.Content)
type Parser interface {
// Parse processes a file and returns its content and metadata.
// Returns an error if the parsing operation fails.
Parse(filePath string) (Document, error)
}
// parserWrapper combines Parser and Loader capabilities into a single type.
// It implements both the Parser interface for document parsing and provides
// loading functionality through an embedded rag.Loader.
//
// The wrapper uses dependency injection to allow customization of both
// the parser and loader components, making it highly configurable and
// testable.
//
// Example usage with custom loader:
//
// customLoader := rag.NewLoader(
// rag.WithTimeout(time.Minute),
// rag.WithTempDir("/custom/temp"),
// )
//
// parser := NewParser(
// WithLoader(customLoader),
// )
type parserWrapper struct {
parser Parser // Core parsing implementation
loader *rag.Loader // Document loading capabilities
}
// ParserOption defines functional options for configuring a Parser.
// This follows the functional options pattern, allowing flexible and
// extensible configuration of the parser without breaking existing code.
//
// Custom options can be created by implementing this function type
// and modifying the parserWrapper fields as needed.
//
// Example creating a custom option:
//
// func WithCustomTimeout(timeout time.Duration) ParserOption {
// return func(pw *parserWrapper) {
// pw.loader = rag.NewLoader(rag.WithTimeout(timeout))
// }
// }
type ParserOption func(*parserWrapper)
// WithLoader injects a custom loader into the parser.
// This option allows you to provide a pre-configured rag.Loader
// instance with custom settings for timeout, temporary directory,
// HTTP client, or other loader-specific configurations.
//
// Example:
//
// customLoader := rag.NewLoader(
// rag.WithTimeout(time.Minute),
// rag.WithTempDir("/custom/temp"),
// )
//
// parser := NewParser(
// WithLoader(customLoader),
// )
func WithLoader(loader *rag.Loader) ParserOption {
return func(pw *parserWrapper) {
pw.loader = loader
}
}
// NewParser creates a new Parser with the given options.
// It initializes a parserWrapper with default settings and applies
// any provided configuration options. The resulting parser implements
// both document parsing and loading capabilities.
//
// Default configuration includes:
// - Standard rag.Loader with default timeout and temp directory
// - Default parser manager for handling various file types
// - Built-in support for common file formats
//
// Example:
//
// // Create parser with default settings
// parser := NewParser()
//
// // Create parser with custom loader
// parser := NewParser(
// WithLoader(customLoader),
// )
//
// // Create parser with multiple options
// parser := NewParser(
// WithLoader(customLoader),
// WithCustomOption(...),
// )
func NewParser(opts ...ParserOption) Parser {
pw := &parserWrapper{
parser: rag.NewParserManager(),
loader: rag.NewLoader(),
}
for _, opt := range opts {
opt(pw)
}
return pw
}
// Parse implements the Parser interface by processing a file and extracting
// its content and metadata. It delegates the actual parsing to the underlying
// parser implementation while maintaining the option to preprocess files
// using the loader capabilities.
//
// Returns:
// - Document: Contains the extracted content and metadata
// - error: Any error encountered during parsing
//
// Example:
//
// doc, err := parser.Parse("document.pdf")
// if err != nil {
// log.Fatal(err)
// }
// fmt.Printf("Content: %s\n", doc.Content)
func (pw *parserWrapper) Parse(filePath string) (Document, error) {
return pw.parser.Parse(filePath)
}
// LoadDir implements the Loader interface by recursively processing
// all files in a directory. It delegates to the embedded loader's
// implementation while maintaining the parser's context.
//
// Parameters:
// - ctx: Context for cancellation and timeout
// - dir: Directory path to process
//
// Returns:
// - []string: Paths to all processed files
// - error: Any error encountered during directory processing
//
// Example:
//
// paths, err := parser.LoadDir(ctx, "/path/to/docs")
// if err != nil {
// log.Fatal(err)
// }
// for _, path := range paths {
// fmt.Printf("Processed: %s\n", path)
// }
func (pw *parserWrapper) LoadDir(ctx context.Context, dir string) ([]string, error) {
return pw.loader.LoadDir(ctx, dir)
}
// LoadFile implements the Loader interface by processing a single file.
// It uses the embedded loader's implementation while maintaining the
// parser's context.
//
// Parameters:
// - ctx: Context for cancellation and timeout
// - path: Path to the file to process
//
// Returns:
// - string: Path to the processed file
// - error: Any error encountered during file processing
//
// Example:
//
// processedPath, err := parser.LoadFile(ctx, "document.pdf")
// if err != nil {
// log.Fatal(err)
// }
// fmt.Printf("Processed file at: %s\n", processedPath)
func (pw *parserWrapper) LoadFile(ctx context.Context, path string) (string, error) {
return pw.loader.LoadFile(ctx, path)
}
// LoadURL implements the Loader interface by downloading and processing
// a file from a URL. It uses the embedded loader's implementation while
// maintaining the parser's context.
//
// Parameters:
// - ctx: Context for cancellation and timeout
// - url: URL of the file to download and process
//
// Returns:
// - string: Path to the downloaded and processed file
// - error: Any error encountered during download or processing
//
// Example:
//
// processedPath, err := parser.LoadURL(ctx, "https://example.com/doc.pdf")
// if err != nil {
// log.Fatal(err)
// }
// fmt.Printf("Downloaded and processed file at: %s\n", processedPath)
func (pw *parserWrapper) LoadURL(ctx context.Context, url string) (string, error) {
return pw.loader.LoadURL(ctx, url)
}
// SetFileTypeDetector customizes how file types are detected.
// This allows for sophisticated file type detection beyond simple
// extension matching.
//
// Example:
//
// SetFileTypeDetector(parser, func(path string) string {
// // Custom logic to determine file type
// if strings.HasSuffix(path, ".md") {
// return "markdown"
// }
// return "unknown"
// })
func SetFileTypeDetector(p Parser, detector func(string) string) {
if pm, ok := p.(*rag.ParserManager); ok {
pm.SetFileTypeDetector(detector)
}
}
// WithParser adds a custom parser for a specific file type.
// This enables the parsing system to handle additional file formats
// through custom implementations.
//
// Example:
//
// // Add support for markdown files
// WithParser(parser, "markdown", &MarkdownParser{})
func WithParser(p Parser, fileType string, parser Parser) {
if pm, ok := p.(*rag.ParserManager); ok {
pm.AddParser(fileType, parser)
}
}
// TextParser returns a new parser for plain text files.
// The text parser:
// - Reads the entire file content
// - Preserves text formatting
// - Handles various encodings
// - Provides basic metadata
//
// Example:
//
// parser := TextParser()
// doc, err := parser.Parse("document.txt")
func TextParser() Parser {
return rag.NewTextParser()
}
// PDFParser returns a new parser for PDF documents.
// The PDF parser:
// - Extracts text content from all pages
// - Maintains text order
// - Handles complex PDF structures
// - Provides document metadata
//
// Example:
//
// parser := PDFParser()
// doc, err := parser.Parse("document.pdf")
func PDFParser() Parser {
return rag.NewPDFParser()
}