Skip to content

Commit

Permalink
Merge pull request #3517 from stweil/alto
Browse files: browse the repository at this point in the history
Write image filename in ALTO output and reduce size of renderer classes
  • Loading branch information
egorpugin authored Aug 7, 2021
2 parents 95223cf + 16fd143 commit 3178c49
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 18 deletions.
7 changes: 5 additions & 2 deletions include/tesseract/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,12 @@ class TESS_API TessResultRenderer {
void AppendData(const char *s, int len);

private:
TessResultRenderer *next_; // Can link multiple renderers together
FILE *fout_; // output file pointer
const char *file_extension_; // standard extension for generated output
std::string title_; // title of document being rendered
int imagenum_; // index of last image added

FILE *fout_; // output file pointer
TessResultRenderer *next_; // Can link multiple renderers together
bool happy_; // I get grumpy when the disk fills up, etc.
};

Expand Down Expand Up @@ -189,6 +189,9 @@ class TESS_API TessAltoRenderer : public TessResultRenderer {
bool BeginDocumentHandler() override;
bool AddImageHandler(TessBaseAPI *api) override;
bool EndDocumentHandler() override;

private:
bool begin_document;
};

/**
Expand Down
32 changes: 19 additions & 13 deletions src/api/altorenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,17 @@ static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
/// Append the ALTO XML for the beginning of the document
///
bool TessAltoRenderer::BeginDocumentHandler() {
AppendString(
// Delay the XML output because we need the name of the image file.
begin_document = true;
return true;
}

///
/// Append the ALTO XML for the layout of the image
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
if (begin_document) {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
Expand All @@ -67,31 +77,26 @@ bool TessAltoRenderer::BeginDocumentHandler() {
"\t\t<sourceImageInformation>\n"
"\t\t\t<fileName>");

AppendString(title());
AppendString(api->GetInputName());

AppendString(
AppendString(
"</fileName>\n"
"\t\t</sourceImageInformation>\n"
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
"\t\t\t<ocrProcessingStep>\n"
"\t\t\t\t<processingSoftware>\n"
"\t\t\t\t\t<softwareName>tesseract ");
AppendString(TessBaseAPI::Version());
AppendString(
AppendString(TessBaseAPI::Version());
AppendString(
"</softwareName>\n"
"\t\t\t\t</processingSoftware>\n"
"\t\t\t</ocrProcessingStep>\n"
"\t\t</OCRProcessing>\n"
"\t</Description>\n"
"\t<Layout>\n");
begin_document = false;
}

return true;
}

///
/// Append the ALTO XML for the layout of the image
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
if (text == nullptr) {
return false;
Expand All @@ -112,7 +117,8 @@ bool TessAltoRenderer::EndDocumentHandler() {
}

TessAltoRenderer::TessAltoRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "xml") {}
: TessResultRenderer(outputbase, "xml"),
begin_document(false) {}

///
/// Make an XML-formatted string with ALTO markup from the internal
Expand Down
6 changes: 3 additions & 3 deletions src/api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ namespace tesseract {
* Base Renderer interface implementation
**********************************************************************/
TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
: file_extension_(extension)
: next_(nullptr)
, fout_(stdout)
, file_extension_(extension)
, title_("")
, imagenum_(-1)
, fout_(stdout)
, next_(nullptr)
, happy_(true) {
if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
std::string outfile = std::string(outputbase) + "." + extension;
Expand Down

0 comments on commit 3178c49

Please sign in to comment.