tesseract  3.05.02
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

Public Member Functions

virtual ~ResultIterator ()
 
virtual void Begin ()
 
virtual bool Next (PageIteratorLevel level)
 
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
 
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~LTRResultIterator ()
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
void RowAttributes (float *row_height, float *descenders, float *ascenders) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
const void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
bool HasTruthString () const
 
bool EquivalentToTruth (const char *str) const
 
char * WordTruthUTF8Text () const
 
char * WordNormedUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
int Cmp (const PageIterator &other) const
 
void SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBox (PageIteratorLevel level, const int padding, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pta * BlockPolygon () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 
bool SetWordBlamerBundle (BlamerBundle *blamer_bundle)
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void BeginWord (int offset)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
bool include_upper_dots_
 
bool include_lower_dots_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Detailed Description

Definition at line 38 of file resultiterator.h.

Constructor & Destructor Documentation

◆ ~ResultIterator()

virtual tesseract::ResultIterator::~ResultIterator ( )
inline virtual

ResultIterator is copy constructible! The default copy constructor works just fine for us.

Definition at line 46 of file resultiterator.h.

46 {}

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator & resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This is protected (not public) because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 33 of file resultiterator.cpp.

34  : LTRResultIterator(resit) {
35  in_minor_direction_ = false;
36  at_beginning_of_minor_run_ = false;
37  preserve_interword_spaces_ = false;
38 
39  BoolParam *p = ParamUtils::FindParam<BoolParam>(
40  "preserve_interword_spaces", GlobalParams()->bool_params,
41  tesseract_->params()->bool_params);
42  if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
43 
44  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
45  MoveToLogicalStartOfTextline();
46 }
GenericVector< BoolParam * > bool_params
Definition: params.h:45
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
ParamsVectors * params()
Definition: ccutil.h:63

Member Function Documentation

◆ Begin()

void tesseract::ResultIterator::Begin ( )
virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 413 of file resultiterator.cpp.

413  {
415  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
416  in_minor_direction_ = false;
417  at_beginning_of_minor_run_ = false;
418  MoveToLogicalStartOfTextline();
419 }

◆ CalculateTextlineOrder()

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values: kMinorRunStart marks the start of minor-direction text; kMinorRunEnd marks the end of minor-direction text; kComplexWord indicates that the next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 255 of file resultiterator.cpp.

258  {
259  reading_order->truncate(0);
260  if (word_dirs.size() == 0) return;
261 
262  // Take all of the runs of minor direction words and insert them
263  // in reverse order.
264  int minor_direction, major_direction, major_step, start, end;
265  if (paragraph_is_ltr) {
266  start = 0;
267  end = word_dirs.size();
268  major_step = 1;
269  major_direction = DIR_LEFT_TO_RIGHT;
270  minor_direction = DIR_RIGHT_TO_LEFT;
271  } else {
272  start = word_dirs.size() - 1;
273  end = -1;
274  major_step = -1;
275  major_direction = DIR_RIGHT_TO_LEFT;
276  minor_direction = DIR_LEFT_TO_RIGHT;
277  // Special rule: if there are neutral words at the right most side
278  // of a line adjacent to a left-to-right word in the middle of the
279  // line, we interpret the end of the line as a single LTR sequence.
280  if (word_dirs[start] == DIR_NEUTRAL) {
281  int neutral_end = start;
282  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
283  neutral_end--;
284  }
285  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
286  // LTR followed by neutrals.
287  // Scan for the beginning of the minor left-to-right run.
288  int left = neutral_end;
289  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
290  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
291  }
292  reading_order->push_back(kMinorRunStart);
293  for (int i = left; i < word_dirs.size(); i++) {
294  reading_order->push_back(i);
295  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
296  }
297  reading_order->push_back(kMinorRunEnd);
298  start = left - 1;
299  }
300  }
301  }
302  for (int i = start; i != end;) {
303  if (word_dirs[i] == minor_direction) {
304  int j = i;
305  while (j != end && word_dirs[j] != major_direction)
306  j += major_step;
307  if (j == end) j -= major_step;
308  while (j != i && word_dirs[j] != minor_direction)
309  j -= major_step;
310  // [j..i] is a minor direction run.
311  reading_order->push_back(kMinorRunStart);
312  for (int k = j; k != i; k -= major_step) {
313  reading_order->push_back(k);
314  }
315  reading_order->push_back(i);
316  reading_order->push_back(kMinorRunEnd);
317  i = j + major_step;
318  } else {
319  reading_order->push_back(i);
320  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
321  i += major_step;
322  }
323  }
324 }
int push_back(T object)
void truncate(int size)
static const int kMinorRunEnd
static const int kMinorRunStart
int size() const
Definition: genericvector.h:72
static const int kComplexWord

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Definition at line 556 of file resultiterator.cpp.

556  {
557  if (it_->word() == NULL) return NULL; // Already at the end!
558  STRING text;
559  switch (level) {
560  case RIL_BLOCK:
561  {
562  ResultIterator pp(*this);
563  do {
564  pp.AppendUTF8ParagraphText(&text);
565  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
566  }
567  break;
568  case RIL_PARA:
569  AppendUTF8ParagraphText(&text);
570  break;
571  case RIL_TEXTLINE:
572  {
573  ResultIterator it(*this);
574  it.MoveToLogicalStartOfTextline();
575  it.IterateAndAppendUTF8TextlineText(&text);
576  }
577  break;
578  case RIL_WORD:
579  AppendUTF8WordText(&text);
580  break;
581  case RIL_SYMBOL:
582  {
583  bool reading_direction_is_ltr =
584  current_paragraph_is_ltr_ ^ in_minor_direction_;
585  if (at_beginning_of_minor_run_) {
586  text += reading_direction_is_ltr ? kLRM : kRLM;
587  }
588  text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
589  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
590  }
591  break;
592  }
593  int length = text.length() + 1;
594  char* result = new char[length];
595  strncpy(result, text.string(), length);
596  return result;
597 }
WERD_RES * word() const
Definition: pageres.h:736
inT32 length() const
Definition: strngs.cpp:196
const char * kRLM
Definition: unicodes.cpp:28
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:345
const char * string() const
Definition: strngs.cpp:201
BLOCK_RES * block() const
Definition: pageres.h:742
Definition: strngs.h:44
const char * kLRM
Definition: unicodes.cpp:27
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right, top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

Definition at line 496 of file resultiterator.cpp.

496  {
497  if (it_->block() == NULL) return false; // Already at the end!
498  if (it_->word() == NULL) return true; // In an image block.
499  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
500 
501  bool at_word_start = IsAtFirstSymbolOfWord();
502  if (level == RIL_WORD) return at_word_start;
503 
504  ResultIterator line_start(*this);
505  // move to the first word in the line...
506  line_start.MoveToLogicalStartOfTextline();
507 
508  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
509  if (level == RIL_TEXTLINE) return at_textline_start;
510 
511  // now we move to the left-most word...
512  line_start.RestartRow();
513  bool at_block_start = at_textline_start &&
514  line_start.it_->block() != line_start.it_->prev_block();
515  if (level == RIL_BLOCK) return at_block_start;
516 
517  bool at_para_start = at_block_start ||
518  (at_textline_start &&
519  line_start.it_->row()->row->para() !=
520  line_start.it_->prev_row()->row->para());
521  if (level == RIL_PARA) return at_para_start;
522 
523  ASSERT_HOST(false); // shouldn't happen.
524  return false;
525 }
WERD_RES * word() const
Definition: pageres.h:736
BLOCK_RES * block() const
Definition: pageres.h:742
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 532 of file resultiterator.cpp.

533  {
534  if (Empty(element)) return true; // Already at the end!
535  // The result is true if we step forward by element and find we are
536  // at the the end of the page or at beginning of *all* levels in:
537  // [level, element).
538  // When there is more than one level difference between element and level,
539  // we could for instance move forward one symbol and still be at the first
540  // word on a line, so we also have to be at the first symbol in a word.
541  ResultIterator next(*this);
542  next.Next(element);
543  if (next.Empty(element)) return true; // Reached the end of the page.
544  while (element > level) {
545  element = static_cast<PageIteratorLevel>(element - 1);
546  if (!next.IsAtBeginningOf(element))
547  return false;
548  }
549  return true;
550 }
bool Empty(PageIteratorLevel level) const
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 421 of file resultiterator.cpp.

421  {
422  if (it_->block() == NULL) return false; // already at end!
423  switch (level) {
424  case RIL_BLOCK: // explicit fall-through
425  case RIL_PARA: // explicit fall-through
426  case RIL_TEXTLINE:
427  if (!PageIterator::Next(level)) return false;
428  if (IsWithinFirstTextlineOfParagraph()) {
429  // if we've advanced to a new paragraph,
430  // recalculate current_paragraph_is_ltr_
431  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
432  }
433  in_minor_direction_ = false;
434  MoveToLogicalStartOfTextline();
435  return it_->block() != NULL;
436  case RIL_SYMBOL:
437  {
438  GenericVector<int> blob_order;
439  CalculateBlobOrder(&blob_order);
440  int next_blob = 0;
441  while (next_blob < blob_order.size() &&
442  blob_index_ != blob_order[next_blob])
443  next_blob++;
444  next_blob++;
445  if (next_blob < blob_order.size()) {
446  // we're in the same word; simply advance one blob.
447  BeginWord(blob_order[next_blob]);
448  at_beginning_of_minor_run_ = false;
449  return true;
450  }
451  level = RIL_WORD; // we've fallen through to the next word.
452  }
453  case RIL_WORD: // explicit fall-through.
454  {
455  if (it_->word() == NULL) return Next(RIL_BLOCK);
456  GenericVectorEqEq<int> word_indices;
457  int this_word_index = LTRWordIndex();
458  CalculateTextlineOrder(current_paragraph_is_ltr_,
459  *this,
460  &word_indices);
461  int final_real_index = word_indices.size() - 1;
462  while (final_real_index > 0 && word_indices[final_real_index] < 0)
463  final_real_index--;
464  for (int i = 0; i < final_real_index; i++) {
465  if (word_indices[i] == this_word_index) {
466  int j = i + 1;
467  for (; j < final_real_index && word_indices[j] < 0; j++) {
468  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
469  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
470  }
471  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
472  // awesome, we move to word_indices[j]
473  if (BidiDebug(3)) {
474  tprintf("Next(RIL_WORD): %d -> %d\n",
475  this_word_index, word_indices[j]);
476  }
477  PageIterator::RestartRow();
478  for (int k = 0; k < word_indices[j]; k++) {
479  PageIterator::Next(RIL_WORD);
480  }
481  MoveToLogicalStartOfWord();
482  return true;
483  }
484  }
485  if (BidiDebug(3)) {
486  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
487  }
488  // we're going off the end of the text line.
489  return Next(RIL_TEXTLINE);
490  }
491  }
492  ASSERT_HOST(false); // shouldn't happen.
493  return false;
494 }
WERD_RES * word() const
Definition: pageres.h:736
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
TESS_LOCAL void BeginWord(int offset)
virtual bool Next(PageIteratorLevel level)
static const int kMinorRunEnd
virtual void RestartRow()
static const int kMinorRunStart
BLOCK_RES * block() const
Definition: pageres.h:742
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 53 of file resultiterator.cpp.

53  {
54  return current_paragraph_is_ltr_;
55 }

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator & resit)
static

Definition at line 48 of file resultiterator.cpp.

49  {
50  return new ResultIterator(resit);
51 }
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)

Member Data Documentation

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3
static

Definition at line 130 of file resultiterator.h.

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

Definition at line 129 of file resultiterator.h.

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1
static

Definition at line 128 of file resultiterator.h.


The documentation for this class was generated from the following files: