Skip to content

Commit 1d1ed17

Browse files
authored
Merge pull request #5 from RemiKalbe/exit-on-non-utf8-files
Exit on non utf8 files
2 parents ac4ce9b + cb69842 commit 1d1ed17

File tree

4 files changed

+52
-3
lines changed

4 files changed

+52
-3
lines changed

CHANGELOG.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.3.0] - 2024-11-30
9+
10+
### Added
11+
12+
- Non-UTF-8 files are now skipped by default (a warning lists the ignored files); pass the `--exit-on-non-utf8` flag to fail on them instead. (fixes [#3](https://github.com/RemiKalbe/cunw/issues/3))
13+
814
## [0.2.3] - 2024-09-19
915

1016
### Fixed
@@ -88,6 +94,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
8894

8995
- Minor improvements to error handling and logging
9096

91-
[Unreleased]: https://github.com/RemiKalbe/cunw/compare/v0.2.0...HEAD
97+
[Unreleased]: https://github.com/RemiKalbe/cunw/compare/v0.3.0...HEAD
98+
[0.3.0]: https://github.com/RemiKalbe/cunw/releases/tag/v0.3.0
99+
[0.2.3]: https://github.com/RemiKalbe/cunw/releases/tag/v0.2.3
100+
[0.2.2]: https://github.com/RemiKalbe/cunw/releases/tag/v0.2.2
101+
[0.2.1]: https://github.com/RemiKalbe/cunw/releases/tag/v0.2.1
92102
[0.2.0]: https://github.com/RemiKalbe/cunw/releases/tag/v0.2.0
93103
[0.1.0]: https://github.com/RemiKalbe/cunw/releases/tag/v0.1.0

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# cunw
22

3-
![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RemiKalbe/cunw/.github%2Fworkflows%2Ftests.yaml)
4-
![Crates.io Version](https://img.shields.io/crates/v/cunw)
3+
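[![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RemiKalbe/cunw/.github%2Fworkflows%2Ftests.yaml)](https://github.com/RemiKalbe/cunw/actions)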
4+
[![Crates.io Version](https://img.shields.io/crates/v/cunw)](https://crates.io/crates/cunw)
55

66
cunw (codebase unwrap) is a command-line interface (CLI) tool that generates a structured representation of a codebase, making it easy to provide context to a large language model (LLM). It recursively traverses a directory, collects file content, and generates a single file that represents the structure and content of the codebase.
77

src/args.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ pub struct Args {
1313
pub output: Option<PathBuf>,
1414
#[arg(short, long, help = "Exclude files or directories matching the specified pattern.", value_hint = ValueHint::Other, required = false, num_args = 0.., action = ArgAction::Append)]
1515
pub exclude: Option<Vec<Glob>>,
16+
#[arg(
17+
long,
18+
help = "Exit on non-UTF-8 content.",
19+
required = false,
20+
default_value = "false"
21+
)]
22+
pub exit_on_non_utf8: bool,
1623
#[arg(
1724
long,
1825
help = "Do not consider the ignore files (.gitignore, .hgignore, .ignore, .git/info/exclude and core.excludesFile in .git/config).",
@@ -52,6 +59,7 @@ mod tests {
5259
assert_eq!(args.path.to_str().unwrap(), "/path/to/codebase");
5360
assert_eq!(args.output, Some(std::path::PathBuf::from("output.txt")));
5461
assert_eq!(args.exclude, None);
62+
assert_eq!(args.exit_on_non_utf8, false);
5563
assert_eq!(args.do_not_consider_ignore_files, false);
5664
assert_eq!(args.dangerously_allow_dot_git_traversal, false);
5765
assert_eq!(args.max_depth, None);
@@ -67,6 +75,7 @@ mod tests {
6775
"custom_output.md",
6876
"-e",
6977
"*.txt",
78+
"--exit-on-non-utf8",
7079
"--do-not-consider-ignore-files",
7180
"--dangerously-allow-dot-git-traversal",
7281
"-m",
@@ -80,6 +89,7 @@ mod tests {
8089
Some(std::path::PathBuf::from("custom_output.md"))
8190
);
8291
assert_eq!(args.exclude.unwrap()[0].glob(), "*.txt");
92+
assert_eq!(args.exit_on_non_utf8, true);
8393
assert_eq!(args.do_not_consider_ignore_files, true);
8494
assert_eq!(args.dangerously_allow_dot_git_traversal, true);
8595
assert_eq!(args.max_depth, Some(3));

src/codebase/mod.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ pub mod item;
1616

1717
pub struct CodebaseBuilder {
1818
excluded_paths: Option<GlobSet>,
19+
exit_on_non_utf8: Option<bool>,
1920
consider_gitignores: Option<bool>,
2021
max_depth: Option<usize>,
2122
follow_symlinks: Option<bool>,
@@ -26,6 +27,7 @@ impl CodebaseBuilder {
2627
pub fn new() -> Self {
2728
Self {
2829
excluded_paths: None,
30+
exit_on_non_utf8: None,
2931
consider_gitignores: None,
3032
max_depth: None,
3133
follow_symlinks: None,
@@ -38,6 +40,11 @@ impl CodebaseBuilder {
3840
self
3941
}
4042

43+
/// Sets whether non-UTF-8 files should be treated as errors.
///
/// Stores the flag as `Some(exit_on_non_utf8)` and returns the builder so
/// calls can be chained. When the flag is unset or `false`, I/O errors of
/// kind `InvalidData` raised while reading files are collected and reported
/// as ignored files rather than being counted as read failures.
pub fn exit_on_non_utf8(mut self, exit_on_non_utf8: bool) -> Self {
44+
self.exit_on_non_utf8 = Some(exit_on_non_utf8);
45+
self
46+
}
47+
4148
pub fn consider_gitignores(mut self, consider_gitignores: bool) -> Self {
4249
self.consider_gitignores = Some(consider_gitignores);
4350
self
@@ -205,8 +212,17 @@ impl CodebaseBuilder {
205212

206213
// Wait for all files to be read
207214
let mut any_error = false;
215+
let mut non_utf8_errors = Vec::new();
208216
while let Some(res) = files_handles.next().await {
209217
if let Err(err) = res.expect("Failed to await file content") {
218+
if !self.exit_on_non_utf8.unwrap_or(false) {
219+
if let CunwErrorKind::Io(io_err) = &err.source {
220+
if io_err.kind() == std::io::ErrorKind::InvalidData {
221+
non_utf8_errors.push(err);
222+
continue;
223+
}
224+
}
225+
}
210226
Logger::warn(format!("Error while reading file: {:#?}", err).as_str());
211227
any_error = true;
212228
}
@@ -216,6 +232,19 @@ impl CodebaseBuilder {
216232
"Failed to read file(s) content(s)".to_string(),
217233
)));
218234
}
235+
if !non_utf8_errors.is_empty() {
236+
Logger::warn(
237+
"Some files were ignored because they were not UTF-8 encoded and could not be read.",
238+
);
239+
for err in non_utf8_errors {
240+
if let Some(file) = err.related_to_file {
241+
Logger::warn(format!(" - {}", file.display()).as_str());
242+
}
243+
}
244+
Logger::warn(
245+
"If you want to exit on non-UTF-8 files, use the --exit-on-non-utf8 flag.",
246+
);
247+
}
219248

220249
Ok(Codebase { tree: root_tree })
221250
}

0 commit comments

Comments
 (0)