Skip to content

Commit

Permalink
PDF Renderer: allow specifying an alternate image or resolution programmatically
Browse files Browse the repository at this point in the history

Support new rendering_dpi api params.
Add pdf renderer tests.
Install pdf font in cmake tool chain.

resolves tesseract-ocr#210
resolves tesseract-ocr#3798
  • Loading branch information
phymbert authored and stweil committed Apr 19, 2024
1 parent 577e8a8 commit 369aa78
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 3 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ option(DISABLE_TIFF "Disable build with libtiff (if available)" OFF)
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
option(DISABLE_CURL "Disable build with libcurl (if available)" OFF)
option(INSTALL_CONFIGS "Install tesseract configs" ON)
option(INSTALL_PDF_TTF "Install pdf font file" ON)

if(NOT ${CMAKE_VERSION} VERSION_LESS "3.15.0")
if(WIN32 AND MSVC)
Expand Down Expand Up @@ -555,6 +556,8 @@ message(STATUS "Build tests [BUILD_TESTS]: ${BUILD_TESTS}")
message(STATUS "Use system ICU Library [USE_SYSTEM_ICU]: ${USE_SYSTEM_ICU}")
message(
STATUS "Install tesseract configs [INSTALL_CONFIGS]: ${INSTALL_CONFIGS}")
message(
STATUS "Install tesseract pdf font [INSTALL_PDF_TTF]: ${INSTALL_PDF_TTF}")
message(STATUS "--------------------------------------------------------")
message(STATUS)

Expand Down Expand Up @@ -962,6 +965,10 @@ if(INSTALL_CONFIGS)
install(FILES ${TESSERACT_TESSCONFIGS}
DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata/tessconfigs)
endif()
# Install the pdf font file when INSTALL_PDF_TTF is enabled.
if(INSTALL_PDF_TTF)
  install(FILES tessdata/pdf.ttf
          DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata)
endif()

# ##############################################################################
# uninstall target
Expand Down
5 changes: 5 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,7 @@ check_PROGRAMS += paragraphs_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += params_model_test
endif # !DISABLED_LEGACY_ENGINE
check_PROGRAMS += pdfrenderer_test
check_PROGRAMS += progress_test
check_PROGRAMS += qrsequence_test
check_PROGRAMS += recodebeam_test
Expand Down Expand Up @@ -1469,6 +1470,10 @@ progress_test_CPPFLAGS = $(unittest_CPPFLAGS)
progress_test_LDFLAGS = $(LEPTONICA_LIBS)
progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

pdfrenderer_test_SOURCES = unittest/pdfrenderer_test.cc
pdfrenderer_test_CPPFLAGS = $(unittest_CPPFLAGS)
pdfrenderer_test_LDADD = $(TESS_LIBS) $(TRAINING_LIBS)

qrsequence_test_SOURCES = unittest/qrsequence_test.cc
qrsequence_test_CPPFLAGS = $(unittest_CPPFLAGS)
qrsequence_test_LDADD = $(TESS_LIBS)
Expand Down
34 changes: 34 additions & 0 deletions include/tesseract/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,23 @@ class TESS_API TessResultRenderer {
return imagenum_;
}

/**
 * Overrides the image that is rendered together with the extracted text.
 *
 * It must be called after BeginDocument and before AddImage. While no
 * override is set (nullptr), GetRenderingImage falls back to the input image
 * of the API, scaled to the rendering resolution when that differs.
 *
 * NOTE(review): the pointer is stored as-is; a caller-supplied image is
 * presumably still owned by the caller - confirm against the implementation.
 *
 * @param rendering_image image to render instead of the input image.
 */
void SetRenderingImage(Pix *rendering_image) {
  this->rendering_image_ = rendering_image;
}

/**
* Specifies the expected rendering resolution.
* If not set, rendering_dpi api params will be used, else the source image
* resolution.
*/
void SetRenderingResolution(int rendering_dpi) {
rendering_dpi_ = rendering_dpi;
}

protected:
/**
* Called by concrete classes.
Expand Down Expand Up @@ -139,6 +156,21 @@ class TESS_API TessResultRenderer {
// This method will grow the output buffer if needed.
void AppendData(const char *s, int len);

// Renderers call this to obtain the image that is rendered together with the
// extracted text. In order of precedence, this method returns:
// - the rendering image set by the caller, or
// - the input image scaled to the rendering resolution if that differs from
//   the source resolution, or
// - the input image from the api otherwise.
// Returns nullptr if the api has no input image or no valid source resolution.
Pix *GetRenderingImage(TessBaseAPI *api);

// Returns the resolution of the rendering image: the value set manually by
// the caller, else the rendering_dpi api parameter, else the source image
// resolution reported by the api.
int GetRenderingResolution(TessBaseAPI *api);

// Resets the rendering image and dpi to the given previous state. A rendering
// image that differs from rendering_image_prev (i.e. a scaled image created
// while handling the page) is destroyed.
void ResetRenderingState(Pix *rendering_image_prev, int rendering_dpi_prev);

template <typename T>
auto AppendData(T &&d) {
AppendData(d.data(), d.size());
Expand All @@ -151,6 +183,8 @@ class TESS_API TessResultRenderer {
const char *file_extension_; // standard extension for generated output
std::string title_; // title of document being rendered
int imagenum_; // index of last image added
Pix *rendering_image_; // Image to render with the extracted text (nullptr: use the input image)
int rendering_dpi_; // Resolution of the rendering_image (0: not set)
bool happy_; // I get grumpy when the disk fills up, etc.
};

Expand Down
13 changes: 10 additions & 3 deletions src/api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,12 @@ static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
}

char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
double ppi = api->GetSourceYResolution();
double input_image_ppi = api->GetSourceYResolution();
double ppi = GetRenderingResolution(api);
double scale = 1;
if (input_image_ppi > 0) {
scale = ppi / input_image_ppi;
}

// These initial conditions are all arbitrary and will be overwritten
double old_x = 0.0, old_y = 0.0;
Expand Down Expand Up @@ -379,6 +384,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
int x1, y1, x2, y2;
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
x1 *= scale; y1 *= scale; x2 *= scale; y2 *= scale;
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
}

Expand Down Expand Up @@ -413,6 +419,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
{
int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
word_x1 *= scale; word_y1 *= scale; word_x2 *= scale; word_y2 *= scale;
GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
line_y1, line_x2, line_y2, &x, &y, &word_length);
}
Expand Down Expand Up @@ -828,9 +835,9 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int obj
}

bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
Pix *pix = api->GetInputImage();
Pix *pix = GetRenderingImage(api);
const char *filename = api->GetInputName();
int ppi = api->GetSourceYResolution();
int ppi = GetRenderingResolution(api);
if (!pix || ppi <= 0) {
return false;
}
Expand Down
62 changes: 62 additions & 0 deletions src/api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include <allheaders.h>
#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>
#include <cstring>
#include <memory> // std::unique_ptr
#include <string> // std::string
#include "serialis.h" // Serialize
#include "tprintf.h"

namespace tesseract {

Expand All @@ -36,6 +38,8 @@ TessResultRenderer::TessResultRenderer(const char *outputbase, const char *exten
, file_extension_(extension)
, title_("")
, imagenum_(-1)
, rendering_image_(nullptr)
, rendering_dpi_(0)
, happy_(true) {
if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
std::string outfile = std::string(outputbase) + "." + extension;
Expand Down Expand Up @@ -90,13 +94,71 @@ bool TessResultRenderer::AddImage(TessBaseAPI *api) {
return false;
}
++imagenum_;
Pix *rendering_image_prev = rendering_image_;
int rendering_dpi_prev = rendering_dpi_;
bool ok = AddImageHandler(api);
ResetRenderingState(rendering_image_prev, rendering_dpi_prev);
if (next_) {
ok = next_->AddImage(api) && ok;
}
return ok;
}

// Puts the rendering image and resolution back to the values captured before
// a page was handled. A rendering image that differs from the captured one is
// released with pixDestroy before the captured pointer is restored.
void TessResultRenderer::ResetRenderingState(Pix *rendering_image_prev,
                                             int rendering_dpi_prev) {
  const bool image_replaced = rendering_image_ != rendering_image_prev;
  if (image_replaced) {
    pixDestroy(&rendering_image_);
    rendering_image_ = rendering_image_prev;
  }
  // Assigning unconditionally gives the same result as assigning only when
  // the two values differ.
  rendering_dpi_ = rendering_dpi_prev;
}

// Returns the image to render with the extracted text:
// - the already known rendering image, if there is one;
// - otherwise the api input image, unchanged when the rendering resolution
//   equals the source resolution, or scaled by rendering/source resolution
//   (the scaled image is remembered in rendering_image_).
// Marks the renderer unhappy and returns nullptr when the api has no input
// image or reports a non-positive source resolution.
Pix *TessResultRenderer::GetRenderingImage(TessBaseAPI *api) {
  if (rendering_image_) {
    return rendering_image_;
  }

  Pix *input_pix = api->GetInputImage();
  const int input_dpi = api->GetSourceYResolution();
  if (!input_pix || input_dpi <= 0) {
    happy_ = false;
    return nullptr;
  }

  const int target_dpi = GetRenderingResolution(api);
  if (target_dpi == input_dpi) {
    // Nothing to rescale: hand out the input image itself.
    return input_pix;
  }

  const float factor =
      static_cast<float>(target_dpi) / static_cast<float>(input_dpi);
  rendering_image_ = pixScale(input_pix, factor, factor);
  return rendering_image_;
}

// Returns the resolution to render with:
// - rendering_dpi_ when it is already set (non-zero);
// - otherwise the "rendering_dpi" api variable when it is positive and
//   differs from the source resolution (the value is then stored in
//   rendering_dpi_);
// - otherwise the source resolution reported by the api.
// In builds without NDEBUG a warning is printed when the api variable lies
// outside [kMinCredibleResolution, kMaxCredibleResolution]; the value is
// used regardless.
int TessResultRenderer::GetRenderingResolution(tesseract::TessBaseAPI *api) {
  if (rendering_dpi_ != 0) {
    return rendering_dpi_;
  }

  const int input_dpi = api->GetSourceYResolution();
  int requested_dpi = 0;
  const bool use_requested =
      api->GetIntVariable("rendering_dpi", &requested_dpi) &&
      requested_dpi > 0 && requested_dpi != input_dpi;
  if (!use_requested) {
    return input_dpi;
  }

#if !defined(NDEBUG)
  if (requested_dpi < kMinCredibleResolution ||
      requested_dpi > kMaxCredibleResolution) {
    tprintf(
        "Warning: User defined rendering dpi %d is outside of expected range "
        "(%d - %d)!\n",
        requested_dpi, kMinCredibleResolution, kMaxCredibleResolution);
  }
#endif
  rendering_dpi_ = requested_dpi;
  return rendering_dpi_;
}

bool TessResultRenderer::EndDocument() {
if (!happy_) {
return false;
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ Tesseract::Tesseract()
, BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
this->params())
, INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
, INT_MEMBER(rendering_dpi, 0, "Scaled input image resolution before rendering", this->params())
, INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
, INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
this->params())
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_pdf);
BOOL_VAR_H(textonly_pdf);
INT_VAR_H(jpg_quality);
INT_VAR_H(rendering_dpi);
INT_VAR_H(user_defined_dpi);
INT_VAR_H(min_characters_to_try);
STRING_VAR_H(unrecognised_char);
Expand Down
139 changes: 139 additions & 0 deletions unittest/pdfrenderer_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// (C) Copyright 2023, Tesseract Contributors.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <allheaders.h>
#include <tesseract/renderer.h>
#include <cstdint>
#include <cstdio>
#include <filesystem>
#include <map>
#include <memory>
#include <string>

#include "include_gunit.h"

namespace tesseract {

// Api variables shared by the resolution tests: sets user_defined_dpi to 300.
static std::map<std::string, std::string> userdefined_dpi_variables = {
{"user_defined_dpi", "300"}};

class TessPDFRendererTest : public testing::Test {
protected:
static std::string TestDataNameToPath(const std::string &name) {
return file::JoinPath(TESTING_DIR, name);
}
static std::string TessdataPath() {
return TESSDATA_DIR;
}
static std::string TestPDFName(const std::string &suffix) {
return "/tmp/tesseract_pdf_renderer_test_phottest" + suffix;
}

static void AssertPDFSizeLT(const std::string &filename, int size) {
std::filesystem::path p = filename + ".pdf";
ASSERT_LT(std::filesystem::file_size(p), size);
}

static void AssertPDFRemove(const std::string &filename) {
ASSERT_EQ(std::remove((filename + ".pdf").c_str()), 0);
}

static bool initializeAPI(
TessBaseAPI &api, const std::map<std::string, std::string> &variables) {
EXPECT_EQ(api.Init(TESSDATA_DIR, "eng", OEM_LSTM_ONLY), 0);
for (const auto &[name, value] : variables) {
api.SetVariable(name.c_str(), value.c_str());
}
return true;
}

static bool ProcessAndRenderPages(
const std::string &input_filename, TessPDFRenderer *pdf_renderer,
const std::map<std::string, std::string> &variables) {
TessBaseAPI api;
initializeAPI(api, variables);
auto testdata_input_filename = TestDataNameToPath(input_filename);
EXPECT_TRUE(api.ProcessPages(testdata_input_filename.c_str(), TESSDATA_DIR,
1000, pdf_renderer));
api.End();
return pdf_renderer->happy();
}

static void RenderPDFAndAssertSize(
const std::string &image_file, const std::string &pdf_suffix,
bool text_only, int max_file_size,
const std::map<std::string, std::string> &variables = {}) {
auto pdf_name = TestPDFName(pdf_suffix);
auto pdf_renderer = std::make_unique<TessPDFRenderer>(
pdf_name.c_str(), "tessdata", text_only);
ASSERT_TRUE(
ProcessAndRenderPages(image_file, pdf_renderer.get(), variables));
AssertPDFSizeLT(pdf_name, max_file_size);
AssertPDFRemove(pdf_name);
}
};

// Renders phototest_2.tif with default settings (image and text) and checks
// that the generated PDF stays below the expected size.
TEST_F(TessPDFRendererTest, TestPDFRenderBasicTest) {
  const bool text_only = false;
  const int max_pdf_size = 113000;
  RenderPDFAndAssertSize("phototest_2.tif", "", text_only, max_pdf_size);
}

// Renders phototest_2.tif with jpg_quality lowered to 40 and checks that the
// generated PDF stays below the (smaller) expected size.
TEST_F(TessPDFRendererTest, TestPDFRenderJPEGQualityTest) {
  const std::map<std::string, std::string> jpeg_settings{
      {"jpg_quality", "40"}};
  const bool text_only = false;
  const int max_pdf_size = 66000;
  RenderPDFAndAssertSize("phototest_2.tif", "jpg_quality", text_only,
                         max_pdf_size, jpeg_settings);
}

// Renders phototest_2.tif with the text-only renderer and checks that the
// generated PDF stays below the expected size.
TEST_F(TessPDFRendererTest, TestPDFRenderTextOnlyTest) {
  const bool text_only = true;
  const int max_pdf_size = 3500;
  RenderPDFAndAssertSize("phototest_2.tif", "text_only", text_only,
                         max_pdf_size);
}

// Checks that a rendering resolution set directly on the renderer (110) is
// honoured: with user_defined_dpi at 300 the generated PDF must stay below
// the expected size.
TEST_F(TessPDFRendererTest, TestPDFRenderLowerResolutionTest) {
  const std::string output_base = TestPDFName("lower_resolution");
  TessPDFRenderer renderer(output_base.c_str(), "tessdata", false);
  renderer.SetRenderingResolution(110);
  CHECK_OK(ProcessAndRenderPages("phototest_2.tif", &renderer,
                                 userdefined_dpi_variables));
  AssertPDFSizeLT(output_base, 35000);
  AssertPDFRemove(output_base);
}

// Checks that a rendering resolution requested through the rendering_dpi api
// variable (110) is honoured: with user_defined_dpi at 300 the generated PDF
// must stay below the expected size.
TEST_F(TessPDFRendererTest, TestPDFLowerResolutionVariableTest) {
  const std::string output_base = TestPDFName("lower_resolution_variable");
  std::map<std::string, std::string> settings{{"rendering_dpi", "110"}};
  settings.insert(userdefined_dpi_variables.cbegin(),
                  userdefined_dpi_variables.cend());
  TessPDFRenderer renderer(output_base.c_str(), "tessdata", false);
  CHECK_OK(ProcessAndRenderPages("phototest_2.tif", &renderer, settings));
  AssertPDFSizeLT(output_base, 35000);
  AssertPDFRemove(output_base);
}

// Test that pdf renderer generates an alternate image in the pdf export:
// OCR runs on phototest_2.tif while phototest.tif is rendered, so the
// generated PDF must stay below the expected size.
TEST_F(TessPDFRendererTest, TestPDFAlternateImageTest) {
  // Load the alternate image first and make sure it is valid. A null image
  // would make the renderer use the input image, which would hide the real
  // cause of a failure behind the size assertion below.
  Pix *alternate_image = pixRead(TestDataNameToPath("phototest.tif").c_str());
  ASSERT_NE(alternate_image, nullptr);

  std::string pdf_name = TestPDFName("alternate_image");
  auto pdf_renderer =
      std::make_unique<TessPDFRenderer>(pdf_name.c_str(), "tessdata", false);
  pdf_renderer->SetRenderingImage(alternate_image);
  const bool rendered =
      ProcessAndRenderPages("phototest_2.tif", pdf_renderer.get(),
                            std::map<std::string, std::string>());
  // Detach the image before destroying it so that the renderer never keeps a
  // pointer to a destroyed image.
  pdf_renderer->SetRenderingImage(nullptr);
  pixDestroy(&alternate_image);
  CHECK_OK(rendered);
  AssertPDFSizeLT(pdf_name, 8000);
  AssertPDFRemove(pdf_name);
}

} // namespace tesseract

0 comments on commit 369aa78

Please sign in to comment.