Skip to content

Commit c981519

Browse files
authored
General improvements (#4)
* Add assertions to the tests. This way, we can verify they work.
* Require mutable references for tesseract. This way, we prevent changing the tesseract object unless you have a mutable reference to it.
* Remove the libc dependency. It isn't required.
* Add myself as an author.
* Follow Cargo clippy suggestions.
* Investigate memory safety. We should free the image after we pass it into Tesseract, because Tesseract makes its own copy: https://tesseract-ocr.github.io/4.0.0/a01625.html#ga0c4c7f05fd58b3665b123232a05545ad I'm not sure how to free the memory from get_text. Advice or pull requests around this will be appreciated.
* Link to tesseract's github.
* Bump to 0.2.0. We're allowed to make breaking changes in pre-1.0.0 releases. This is a breaking change because many of the methods now require a mutable reference.
1 parent 9d04add commit c981519

File tree

4 files changed

+35
-20
lines changed

4 files changed

+35
-20
lines changed

Cargo.toml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
[package]
22
name = "tesseract"
3-
version = "0.1.0"
4-
authors = ["Kevin Kwok <[email protected]>"]
3+
version = "0.2.0"
4+
authors = ["Kevin Kwok <[email protected]>", "Chris Couzens <[email protected]>"]
55
documentation = "https://docs.rs/tesseract"
66
repository = "https://github.com/antimatter15/tesseract-rs"
77
description = "Higher-level bindings for Tesseract OCR"
88
license = "MIT"
99
keywords = ["tesseract", "OCR", "bindings"]
10-
10+
categories = ["api-bindings", "multimedia::images"]
1111

1212
[dependencies]
13-
libc = "0.1"
1413
leptonica-sys = "0.1.0"
1514

1615
[dependencies.tesseract-sys]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# tesseract-rs
2-
Rust bindings for Tesseract
2+
Rust bindings for [Tesseract](https://github.com/tesseract-ocr/tesseract)

src/lib.rs

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
extern crate leptonica_sys;
2-
extern crate libc;
32
extern crate tesseract_sys;
43

5-
use leptonica_sys::pixRead;
4+
use leptonica_sys::{pixRead, pixFreeData};
65
use std::ffi::CStr;
76
use std::ffi::CString;
87
use std::ptr;
@@ -18,11 +17,16 @@ pub struct Tesseract {
1817

1918
impl Drop for Tesseract {
2019
fn drop(&mut self) {
21-
println!("Ave Imperator! Nos morituri te salutamus.");
2220
unsafe { TessBaseAPIDelete(self.raw) }
2321
}
2422
}
2523

24+
impl Default for Tesseract {
25+
fn default() -> Self {
26+
Self::new()
27+
}
28+
}
29+
2630
fn cs(string: &str) -> CString {
2731
// do not call as_ptr yet, since the data will be freed before we return
2832
CString::new(string).unwrap()
@@ -34,50 +38,62 @@ impl Tesseract {
3438
raw: unsafe { TessBaseAPICreate() },
3539
}
3640
}
37-
pub fn set_lang(&self, language: &str) -> i32 {
41+
pub fn set_lang(&mut self, language: &str) -> i32 {
3842
let cs_language = cs(language);
3943
unsafe { TessBaseAPIInit3(self.raw, ptr::null(), cs_language.as_ptr()) }
4044
}
41-
pub fn set_image(&self, filename: &str) {
45+
pub fn set_image(&mut self, filename: &str) {
4246
let cs_filename = cs(filename);
4347
unsafe {
4448
let img = pixRead(cs_filename.as_ptr());
4549
TessBaseAPISetImage2(self.raw, img);
50+
pixFreeData(img);
4651
}
4752
}
48-
pub fn set_variable(&self, name: &str, value: &str) -> i32 {
53+
pub fn set_variable(&mut self, name: &str, value: &str) -> i32 {
4954
let cs_name = cs(name);
5055
let cs_value = cs(value);
5156
unsafe { TessBaseAPISetVariable(self.raw, cs_name.as_ptr(), cs_value.as_ptr()) }
5257
}
53-
pub fn recognize(&self) -> i32 {
58+
pub fn recognize(&mut self) -> i32 {
5459
unsafe { TessBaseAPIRecognize(self.raw, ptr::null_mut()) }
5560
}
5661
pub fn get_text(&self) -> &str {
62+
// I think this leaks memory
63+
// The c++ documentation
64+
// https://tesseract-ocr.github.io/4.0.0/a01625.html#ga115ef656f83352ba608b4f0bf9cfa2c4
65+
// says memory must be freed "with the delete [] operator".
66+
// The c documentation doesn't say much
67+
// https://tesseract-ocr.github.io/4.0.0/a00014.html#a624731fab8a0107a6949195f62d63710
68+
// But the code directly calls the c++ function.
69+
// Rust can't call `delete []`, so I'm not sure how rust is meant to free this string.
5770
unsafe {
5871
str::from_utf8(CStr::from_ptr(TessBaseAPIGetUTF8Text(self.raw)).to_bytes()).unwrap()
5972
}
6073
}
6174
}
6275

6376
pub fn ocr(filename: &str, language: &str) -> String {
64-
let cube = Tesseract::new();
77+
let mut cube = Tesseract::new();
6578
cube.set_lang(language);
6679
cube.set_image(filename);
6780
cube.recognize();
68-
return cube.get_text().to_string();
81+
cube.get_text().to_string()
6982
}
7083

7184
#[test]
72-
fn blah() {
73-
ocr("img.png", "eng");
85+
fn ocr_test() {
86+
assert_eq!(
87+
ocr("img.png", "eng"),
88+
include_str!("../img.txt").to_string()
89+
);
7490
}
7591

7692
#[test]
77-
fn it_works() {
78-
let cube = Tesseract::new();
93+
fn expanded_test() {
94+
let mut cube = Tesseract::new();
7995
cube.set_lang("eng");
8096
cube.set_image("img.png");
8197
cube.recognize();
82-
println!("{:?}", cube.get_text());
98+
assert_eq!(cube.get_text(), include_str!("../img.txt").to_string())
8399
}

wrinkle/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use tesseract::*;
33

44
fn main() {
55
println!("Hello, world!");
6-
let cube = Tesseract::new();
6+
let mut cube = Tesseract::new();
77
let filename = "img.png";
88
let language = "eng";
99
cube.set_lang(language);

0 commit comments

Comments
 (0)