Skip to content

Commit f81bbc9

Browse files
committed
Add ability to get HOCR data
Will do the same in leptess this afternoon
1 parent 34e574e commit f81bbc9

File tree

5 files changed

+113
-3
lines changed

5 files changed

+113
-3
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tesseract"
3-
version = "0.6.2"
3+
version = "0.7.0"
44
authors = ["Kevin Kwok <[email protected]>", "Chris Couzens <[email protected]>"]
55
documentation = "https://docs.rs/tesseract"
66
repository = "https://github.com/antimatter15/tesseract-rs"

img.html

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
<div class='ocr_page' id='page_1' title='image ""; bbox 0 0 2256 324; ppageno 0'>
2+
<div class='ocr_carea' id='block_1_1' title="bbox 0 17 2206 314">
3+
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 0 17 2206 314">
4+
<span class='ocr_line' id='line_1_1' title="bbox 5 17 2202 70; baseline 0 -10; x_size 53; x_descenders 10; x_ascenders 13">
5+
<span class='ocrx_word' id='word_1_1' title='bbox 5 19 237 60; x_wconf 96'>Hundreds</span>
6+
<span class='ocrx_word' id='word_1_2' title='bbox 256 19 306 60; x_wconf 96'>of</span>
7+
<span class='ocrx_word' id='word_1_3' title='bbox 320 17 581 70; x_wconf 96'>companies</span>
8+
<span class='ocrx_word' id='word_1_4' title='bbox 599 19 768 60; x_wconf 96'>around</span>
9+
<span class='ocrx_word' id='word_1_5' title='bbox 787 19 866 60; x_wconf 96'>the</span>
10+
<span class='ocrx_word' id='word_1_6' title='bbox 884 19 1016 60; x_wconf 95'>world</span>
11+
<span class='ocrx_word' id='word_1_7' title='bbox 1037 30 1111 60; x_wconf 96'>are</span>
12+
<span class='ocrx_word' id='word_1_8' title='bbox 1133 17 1259 70; x_wconf 96'>using</span>
13+
<span class='ocrx_word' id='word_1_9' title='bbox 1278 22 1381 60; x_wconf 96'>Rust</span>
14+
<span class='ocrx_word' id='word_1_10' title='bbox 1400 17 1438 59; x_wconf 96'>in</span>
15+
<span class='ocrx_word' id='word_1_11' title='bbox 1462 17 1724 70; x_wconf 96'>production</span>
16+
<span class='ocrx_word' id='word_1_12' title='bbox 1743 19 1880 70; x_wconf 96'>today</span>
17+
<span class='ocrx_word' id='word_1_13' title='bbox 1896 19 1964 60; x_wconf 96'>for</span>
18+
<span class='ocrx_word' id='word_1_14' title='bbox 1980 19 2081 68; x_wconf 93'>fast,</span>
19+
<span class='ocrx_word' id='word_1_15' title='bbox 2103 19 2202 60; x_wconf 90'>low-</span>
20+
</span>
21+
<span class='ocr_line' id='line_1_2' title="bbox 5 98 2206 151; baseline -0 -10; x_size 51; x_descenders 9; x_ascenders 12">
22+
<span class='ocrx_word' id='word_1_16' title='bbox 5 111 223 149; x_wconf 96'>resource,</span>
23+
<span class='ocrx_word' id='word_1_17' title='bbox 243 100 598 151; x_wconf 95'>cross-platform</span>
24+
<span class='ocrx_word' id='word_1_18' title='bbox 619 98 855 141; x_wconf 96'>solutions.</span>
25+
<span class='ocrx_word' id='word_1_19' title='bbox 874 100 1086 141; x_wconf 96'>Software</span>
26+
<span class='ocrx_word' id='word_1_20' title='bbox 1104 111 1187 151; x_wconf 96'>you</span>
27+
<span class='ocrx_word' id='word_1_21' title='bbox 1211 100 1335 141; x_wconf 96'>know</span>
28+
<span class='ocrx_word' id='word_1_22' title='bbox 1352 100 1438 141; x_wconf 94'>and</span>
29+
<span class='ocrx_word' id='word_1_23' title='bbox 1462 100 1569 149; x_wconf 96'>love,</span>
30+
<span class='ocrx_word' id='word_1_24' title='bbox 1591 98 1671 141; x_wconf 96'>like</span>
31+
<span class='ocrx_word' id='word_1_25' title='bbox 1694 98 1866 149; x_wconf 96'>Firefox,</span>
32+
<span class='ocrx_word' id='word_1_26' title='bbox 1889 100 2101 151; x_wconf 96'>Dropbox,</span>
33+
<span class='ocrx_word' id='word_1_27' title='bbox 2120 100 2206 141; x_wconf 96'>and</span>
34+
</span>
35+
<span class='ocr_line' id='line_1_3' title="bbox 3 176 2095 233; baseline 0 -12; x_size 53; x_descenders 11; x_ascenders 11">
36+
<span class='ocrx_word' id='word_1_28' title='bbox 3 181 261 230; x_wconf 92'>Cloudflare,</span>
37+
<span class='ocrx_word' id='word_1_29' title='bbox 283 192 387 222; x_wconf 96'>uses</span>
38+
<span class='ocrx_word' id='word_1_30' title='bbox 408 184 522 222; x_wconf 96'>Rust.</span>
39+
<span class='ocrx_word' id='word_1_31' title='bbox 542 184 660 222; x_wconf 96'>From</span>
40+
<span class='ocrx_word' id='word_1_32' title='bbox 674 185 884 233; x_wconf 95'>startups</span>
41+
<span class='ocrx_word' id='word_1_33' title='bbox 896 185 946 222; x_wconf 95'>to</span>
42+
<span class='ocrx_word' id='word_1_34' title='bbox 962 180 1086 233; x_wconf 95'>large</span>
43+
<span class='ocrx_word' id='word_1_35' title='bbox 1100 176 1430 233; x_wconf 96'>corporations,</span>
44+
<span class='ocrx_word' id='word_1_36' title='bbox 1443 180 1559 222; x_wconf 96'>from</span>
45+
<span class='ocrx_word' id='word_1_37' title='bbox 1574 180 1834 222; x_wconf 96'>embedded</span>
46+
<span class='ocrx_word' id='word_1_38' title='bbox 1850 176 2033 222; x_wconf 96'>devices</span>
47+
<span class='ocrx_word' id='word_1_39' title='bbox 2045 185 2095 222; x_wconf 96'>to</span>
48+
</span>
49+
<span class='ocr_line' id='line_1_4' title="bbox 0 257 987 314; baseline 0 -11; x_size 57; x_descenders 11; x_ascenders 15">
50+
<span class='ocrx_word' id='word_1_40' title='bbox 0 261 204 303; x_wconf 96'>scalable</span>
51+
<span class='ocrx_word' id='word_1_41' title='bbox 217 261 318 303; x_wconf 96'>web</span>
52+
<span class='ocrx_word' id='word_1_42' title='bbox 331 257 546 312; x_wconf 95'>services,</span>
53+
<span class='ocrx_word' id='word_1_43' title='bbox 562 264 671 303; x_wconf 94'>Rust</span>
54+
<span class='ocrx_word' id='word_1_44' title='bbox 684 257 723 303; x_wconf 94'>is</span>
55+
<span class='ocrx_word' id='word_1_45' title='bbox 736 272 763 303; x_wconf 96'>a</span>
56+
<span class='ocrx_word' id='word_1_46' title='bbox 777 266 910 314; x_wconf 88'>great</span>
57+
<span class='ocrx_word' id='word_1_47' title='bbox 921 261 987 303; x_wconf 95'>fit.</span>
58+
</span>
59+
</p>
60+
</div>
61+
</div>

src/lib.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ extern crate thiserror;
33
use self::thiserror::Error;
44
use std::ffi::CString;
55
use std::ffi::NulError;
6+
use std::os::raw::c_int;
67
use std::str;
78

89
pub mod plumbing;
@@ -41,6 +42,8 @@ pub enum TesseractError {
4142
RecognizeError(#[from] plumbing::TessBaseAPIRecogniseError),
4243
#[error("Errored whilst getting text")]
4344
GetTextError(#[from] plumbing::TessBaseAPIGetUTF8TextError),
45+
#[error("Errored whilst getting HOCR text")]
46+
GetHOCRTextError(#[from] plumbing::TessBaseAPIGetHOCRTextError),
4447
#[error("Errored whilst setting frame")]
4548
SetFrameError(#[from] plumbing::TessBaseAPISetImageSafetyError),
4649
#[error("Errored whilst setting image from mem")]
@@ -111,6 +114,21 @@ impl Tesseract {
111114
.to_string_lossy()
112115
.into_owned())
113116
}
117+
118+
/// Get the recognised text as hOCR: HTML markup whose tags carry bounding boxes (`bbox`).
119+
///
120+
/// See [img.html](../img.html) for an example. Any error from the plumbing call is propagated; the text is converted with `to_string_lossy`.
121+
pub fn get_hocr_text(
122+
&mut self,
123+
page: c_int,
124+
) -> Result<String, plumbing::TessBaseAPIGetHOCRTextError> {
125+
Ok(self
126+
.0
127+
.get_hocr_text(page)?
128+
.as_ref()
129+
.to_string_lossy()
130+
.into_owned())
131+
}
114132
}
115133

116134
pub fn ocr(filename: &str, language: &str) -> Result<String, TesseractError> {
@@ -170,3 +188,10 @@ fn expanded_test() -> Result<(), TesseractError> {
170188
assert_eq!(&cube.get_text()?, include_str!("../img.txt"));
171189
Ok(())
172190
}
191+
192+
#[test]
193+
fn hocr_test() -> Result<(), TesseractError> {
194+
let mut cube = Tesseract::new(None, Some("eng"))?.set_image("img.png")?;
195+
assert_eq!(&cube.get_hocr_text(0)?, include_str!("../img.html"));
196+
Ok(())
197+
}

src/plumbing/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub use self::pix::Pix;
1111
pub use self::pix::PixReadError;
1212
pub use self::pix::PixReadMemError;
1313
pub use self::tess_base_api::TessBaseAPI;
14+
pub use self::tess_base_api::TessBaseAPIGetHOCRTextError;
1415
pub use self::tess_base_api::TessBaseAPIGetUTF8TextError;
1516
pub use self::tess_base_api::TessBaseAPIInitError;
1617
pub use self::tess_base_api::TessBaseAPIRecogniseError;

src/plumbing/tess_base_api.rs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ extern crate tesseract_sys;
22
extern crate thiserror;
33

44
use self::tesseract_sys::{
5-
TessBaseAPICreate, TessBaseAPIDelete, TessBaseAPIGetUTF8Text, TessBaseAPIInit3,
6-
TessBaseAPIRecognize, TessBaseAPISetImage, TessBaseAPISetImage2,
5+
TessBaseAPICreate, TessBaseAPIDelete, TessBaseAPIGetHOCRText, TessBaseAPIGetUTF8Text,
6+
TessBaseAPIInit3, TessBaseAPIRecognize, TessBaseAPISetImage, TessBaseAPISetImage2,
77
TessBaseAPISetSourceResolution, TessBaseAPISetVariable,
88
};
99
use self::thiserror::Error;
@@ -40,6 +40,10 @@ pub struct TessBaseAPISetVariableError();
4040
#[error("TessBaseApi failed to recognize")]
4141
pub struct TessBaseAPIRecogniseError();
4242

43+
#[derive(Debug, Error)]
44+
#[error("TessBaseApi get_hocr_text returned null")]
45+
pub struct TessBaseAPIGetHOCRTextError();
46+
4347
#[derive(Debug, Error)]
4448
#[error("TessBaseApi get_utf8_text returned null")]
4549
pub struct TessBaseAPIGetUTF8TextError();
@@ -175,6 +179,25 @@ impl TessBaseAPI {
175179
Ok(unsafe { TesseractText::new(ptr) })
176180
}
177181
}
182+
183+
/// Wrapper for `TessBaseAPIGetHOCRText`, the C binding of Tesseract's [`GetHOCRText`](https://tesseract-ocr.github.io/tessapi/5.x/a02438.html)
184+
///
185+
/// Get the text out of an image as hOCR (HTML) markup. `page` is passed straight through to the C call.
186+
///
187+
/// Returns `TessBaseAPIGetHOCRTextError` when the C call returns a null pointer; it is not clear to me what would cause this.
188+
///
189+
/// This will implicitly call `recognize` if required.
190+
pub fn get_hocr_text(
191+
&mut self,
192+
page: c_int,
193+
) -> Result<TesseractText, TessBaseAPIGetHOCRTextError> {
194+
let ptr = unsafe { TessBaseAPIGetHOCRText(self.0, page) };
195+
if ptr.is_null() {
196+
Err(TessBaseAPIGetHOCRTextError {})
197+
} else {
198+
Ok(unsafe { TesseractText::new(ptr) })
199+
}
200+
}
178201
}
179202

180203
#[test]

0 commit comments

Comments
 (0)