tesseract  3.05.02
resultiterator.cpp
Go to the documentation of this file.
1 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 // Created: Fri May 27 13:58:06 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "resultiterator.h"
23 
24 #include "allheaders.h"
25 #include "pageres.h"
26 #include "strngs.h"
27 #include "tesseractclass.h"
28 #include "unicharset.h"
29 #include "unicodes.h"
30 
31 namespace tesseract {
32 
// Constructor body. Initializes the LTRResultIterator base from resit,
// clears the bidi bookkeeping flags, reads the optional
// "preserve_interword_spaces" parameter, works out the direction of the
// current paragraph and finally moves to the logical start of the text line.
// NOTE(review): the signature line that precedes the initializer list is
// absent from this listing.
34  : LTRResultIterator(resit) {
35  in_minor_direction_ = false;
36  at_beginning_of_minor_run_ = false;
37  preserve_interword_spaces_ = false;
38 
// Adopt the parameter's value only if the lookup succeeds; otherwise the
// default (false) set above is kept.
// NOTE(review): listing line 41, which holds the remaining FindParam
// arguments, is missing here.
39  BoolParam *p = ParamUtils::FindParam<BoolParam>(
40  "preserve_interword_spaces", GlobalParams()->bool_params,
42  if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
43 
44  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
45  MoveToLogicalStartOfTextline();
46 }
47 
// Factory: returns a ResultIterator allocated with `new` and constructed
// from resit. Presumably the caller owns (and must delete) the result.
// NOTE(review): the first line of the signature is absent from this listing.
49  const LTRResultIterator &resit) {
50  return new ResultIterator(resit);
51 }
52 
// Accessor body: returns the cached paragraph direction flag.
// NOTE(review): the signature line is absent from this listing.
54  return current_paragraph_is_ltr_;
55 }
56 
57 bool ResultIterator::CurrentParagraphIsLtr() const {
58  if (!it_->word())
59  return true; // doesn't matter.
60  LTRResultIterator it(*this);
61  it.RestartParagraph();
62  // Try to figure out the ltr-ness of the paragraph. The rules below
63  // make more sense in the context of a difficult paragraph example.
64  // Here we denote {ltr characters, RTL CHARACTERS}:
65  //
66  // "don't go in there!" DAIS EH
67  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
68  // .GNIDLIUB GNINRUB
69  //
70  // On the first line, the left-most word is LTR and the rightmost word
71  // is RTL. Thus, we are better off taking the majority direction for
72  // the whole paragraph contents. So instead of "the leftmost word is LTR"
73  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
74  // would not do: Typically an RTL paragraph would *not* start with an LTR
75  // word. So our heuristics are as follows:
76  //
77  // (1) If the first text line has an RTL word in the left-most position
78  // it is RTL.
79  // (2) If the first text line has an LTR word in the right-most position
80  // it is LTR.
81  // (3) If neither of the above is true, take the majority count for the
82  // paragraph -- if there are more rtl words, it is RTL. If there
83  // are more LTR words, it's LTR.
84  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
85  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
86  int num_ltr, num_rtl;
87  num_rtl = leftmost_rtl ? 1 : 0;
88  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
89  for (it.Next(RIL_WORD);
90  !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
91  it.Next(RIL_WORD)) {
92  StrongScriptDirection dir = it.WordDirection();
93  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
94  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
95  num_ltr += rightmost_ltr ? 1 : 0;
96  }
97  if (leftmost_rtl)
98  return false;
99  if (rightmost_ltr)
100  return true;
101  // First line is ambiguous. Take statistics on the whole paragraph.
102  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
103  StrongScriptDirection dir = it.WordDirection();
104  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
105  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
106  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
107  return num_ltr >= num_rtl;
108 }
109 
// Negative sentinel values that CalculateTextlineOrder interleaves with the
// (non-negative) word indices of a reading-order vector:
//   kMinorRunStart - pushed before a run of minor-direction words,
//   kMinorRunEnd   - pushed after such a run,
//   kComplexWord   - pushed after a word whose direction is DIR_MIX.
110 const int ResultIterator::kMinorRunStart = -1;
111 const int ResultIterator::kMinorRunEnd = -2;
112 const int ResultIterator::kComplexWord = -3;
113 
114 void ResultIterator::CalculateBlobOrder(
115  GenericVector<int> *blob_indices) const {
116  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
117  blob_indices->clear();
118  if (Empty(RIL_WORD)) return;
119  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
120  // Easy! just return the blobs in order;
121  for (int i = 0; i < word_length_; i++)
122  blob_indices->push_back(i);
123  return;
124  }
125 
126  // The blobs are in left-to-right order, but the current reading context
127  // is right-to-left.
128  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
129  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
130  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
131  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
132  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
133  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
134  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
135 
136  // Step 1: Scan for and mark European Number sequences
137  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
138  GenericVector<int> letter_types;
139  for (int i = 0; i < word_length_; i++) {
140  letter_types.push_back(it_->word()->SymbolDirection(i));
141  }
142  // Convert a single separtor sandwiched between two EN's into an EN.
143  for (int i = 0; i + 2 < word_length_; i++) {
144  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
145  (letter_types[i + 1] == U_EURO_NUM_SEP ||
146  letter_types[i + 1] == U_COMMON_NUM_SEP)) {
147  letter_types[i + 1] = U_EURO_NUM;
148  }
149  }
150  // Scan for sequences of European Number Terminators around ENs and convert
151  // them to ENs.
152  for (int i = 0; i < word_length_; i++) {
153  if (letter_types[i] == U_EURO_NUM_TERM) {
154  int j = i + 1;
155  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
156  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
157  // The sequence [i..j] should be converted to all European Numbers.
158  for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
159  }
160  j = i - 1;
161  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
162  if (j > -1 && letter_types[j] == U_EURO_NUM) {
163  // The sequence [j..i] should be converted to all European Numbers.
164  for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
165  }
166  }
167  }
168  // Step 2: Convert all remaining types to either L or R.
169  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
170  // All other are R.
171  for (int i = 0; i < word_length_;) {
172  int ti = letter_types[i];
173  if (ti == U_LTR || ti == U_EURO_NUM) {
174  // Left to right sequence; scan to the end of it.
175  int last_good = i;
176  for (int j = i + 1; j < word_length_; j++) {
177  int tj = letter_types[j];
178  if (tj == U_LTR || tj == U_EURO_NUM) {
179  last_good = j;
180  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
181  // do nothing.
182  } else {
183  break;
184  }
185  }
186  // [i..last_good] is the L sequence
187  for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
188  i = last_good + 1;
189  } else {
190  letter_types[i] = U_RTL;
191  i++;
192  }
193  }
194 
195  // At this point, letter_types is entirely U_LTR or U_RTL.
196  for (int i = word_length_ - 1; i >= 0;) {
197  if (letter_types[i] == U_RTL) {
198  blob_indices->push_back(i);
199  i--;
200  } else {
201  // left to right sequence. scan to the beginning.
202  int j = i - 1;
203  for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
204  // Now (j, i] is LTR
205  for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
206  i = j;
207  }
208  }
209  ASSERT_HOST(blob_indices->size() == word_length_);
210 }
211 
212 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
213  for (int i = 0; i < dirs.size(); i++) {
214  switch (dirs[i]) {
215  case DIR_NEUTRAL: tprintf ("N "); break;
216  case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
217  case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
218  case DIR_MIX: tprintf("Z "); break;
219  default: tprintf("? "); break;
220  }
221  }
222  tprintf("\n");
223 }
224 
// Convenience overload: computes the reading order of the words on the text
// line of resit into *word_indices by forwarding to the overload that also
// takes a directions vector, passing the address of a local `directions`.
// NOTE(review): the first signature line and the declaration of
// `directions` (listing lines 225 and 229) are absent from this listing.
226  bool paragraph_is_ltr,
227  const LTRResultIterator &resit,
228  GenericVectorEqEq<int> *word_indices) const {
230  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
231 }
232 
// Collects the strong script direction of every word on the text line of
// resit (in left-to-right order) and then derives the reading order of those
// words into *word_indices via the direction-vector overload.
// NOTE(review): several lines of this definition (the first signature line,
// the `dirs_arg` parameter line and the declarations of `dirs` and
// `directions`) are absent from this listing.
234  bool paragraph_is_ltr,
235  const LTRResultIterator &resit,
237  GenericVectorEqEq<int> *word_indices) const {
// Write into the caller's vector when one was supplied, else into the local.
240  directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
241  directions->truncate(0);
242 
243  // A LTRResultIterator goes strictly left-to-right word order.
244  LTRResultIterator ltr_it(resit);
245  ltr_it.RestartRow();
// Note: on an empty line this returns before word_indices is truncated.
246  if (ltr_it.Empty(RIL_WORD)) return;
247  do {
248  directions->push_back(ltr_it.WordDirection());
249  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
250 
251  word_indices->truncate(0);
252  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
253 }
254 
// Core word-ordering routine. Given the paragraph direction and the strong
// direction of each word on a line (word_dirs, in left-to-right order),
// fills *reading_order with the word indices in reading order. Negative
// sentinels are interleaved: kMinorRunStart / kMinorRunEnd bracket each run
// of minor-direction words and kComplexWord follows a DIR_MIX word that is
// emitted outside a bracketed run (or inside the special trailing run below).
// *reading_order is emptied first and stays empty when word_dirs is empty.
// NOTE(review): the first signature line is absent from this listing.
256  bool paragraph_is_ltr,
257  const GenericVector<StrongScriptDirection> &word_dirs,
258  GenericVectorEqEq<int> *reading_order) {
259  reading_order->truncate(0);
260  if (word_dirs.size() == 0) return;
261 
262  // Take all of the runs of minor direction words and insert them
263  // in reverse order.
// start/end/major_step describe a walk over word_dirs in the paragraph's
// (major) direction: ascending indices for LTR, descending for RTL.
264  int minor_direction, major_direction, major_step, start, end;
265  if (paragraph_is_ltr) {
266  start = 0;
267  end = word_dirs.size();
268  major_step = 1;
269  major_direction = DIR_LEFT_TO_RIGHT;
270  minor_direction = DIR_RIGHT_TO_LEFT;
271  } else {
272  start = word_dirs.size() - 1;
273  end = -1;
274  major_step = -1;
275  major_direction = DIR_RIGHT_TO_LEFT;
276  minor_direction = DIR_LEFT_TO_RIGHT;
277  // Special rule: if there are neutral words at the right most side
278  // of a line adjacent to a left-to-right word in the middle of the
279  // line, we interpret the end of the line as a single LTR sequence.
280  if (word_dirs[start] == DIR_NEUTRAL) {
// Skip leftwards over the trailing neutral words (stopping at index 0).
281  int neutral_end = start;
282  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
283  neutral_end--;
284  }
285  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
286  // LTR followed by neutrals.
287  // Scan for the beginning of the minor left-to-right run.
// `left` ends up at the left-most LTR word reachable without crossing an
// RTL word.
288  int left = neutral_end;
289  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
290  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
291  }
// Emit [left, end of line] in ascending order as one bracketed minor run,
// then let the main loop continue from the word just left of it.
292  reading_order->push_back(kMinorRunStart);
293  for (int i = left; i < word_dirs.size(); i++) {
294  reading_order->push_back(i);
295  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
296  }
297  reading_order->push_back(kMinorRunEnd);
298  start = left - 1;
299  }
300  }
301  }
// Main pass in the major direction.
302  for (int i = start; i != end;) {
303  if (word_dirs[i] == minor_direction) {
// Advance j to the first major-direction word (or the end), then back up to
// the last minor-direction word, so trailing non-minor words are left out of
// the run.
304  int j = i;
305  while (j != end && word_dirs[j] != major_direction)
306  j += major_step;
307  if (j == end) j -= major_step;
308  while (j != i && word_dirs[j] != minor_direction)
309  j -= major_step;
310  // [j..i] is a minor direction run.
// The run is emitted from j back to i, i.e. reversed relative to the major
// direction, and bracketed by the start/end sentinels.
311  reading_order->push_back(kMinorRunStart);
312  for (int k = j; k != i; k -= major_step) {
313  reading_order->push_back(k);
314  }
315  reading_order->push_back(i);
316  reading_order->push_back(kMinorRunEnd);
317  i = j + major_step;
318  } else {
319  reading_order->push_back(i);
320  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
321  i += major_step;
322  }
323  }
324 }
325 
326 int ResultIterator::LTRWordIndex() const {
327  int this_word_index = 0;
328  LTRResultIterator textline(*this);
329  textline.RestartRow();
330  while (!textline.PositionedAtSameWord(it_)) {
331  this_word_index++;
332  textline.Next(RIL_WORD);
333  }
334  return this_word_index;
335 }
336 
337 void ResultIterator::MoveToLogicalStartOfWord() {
338  if (word_length_ == 0) {
339  BeginWord(0);
340  return;
341  }
342  GenericVector<int> blob_order;
343  CalculateBlobOrder(&blob_order);
344  if (blob_order.size() == 0 || blob_order[0] == 0) return;
345  BeginWord(blob_order[0]);
346 }
347 
348 bool ResultIterator::IsAtFinalSymbolOfWord() const {
349  if (!it_->word()) return true;
350  GenericVector<int> blob_order;
351  CalculateBlobOrder(&blob_order);
352  return blob_order.size() == 0 || blob_order.back() == blob_index_;
353 }
354 
355 bool ResultIterator::IsAtFirstSymbolOfWord() const {
356  if (!it_->word()) return true;
357  GenericVector<int> blob_order;
358  CalculateBlobOrder(&blob_order);
359  return blob_order.size() == 0 || blob_order[0] == blob_index_;
360 }
361 
362 void ResultIterator::AppendSuffixMarks(STRING *text) const {
363  if (!it_->word()) return;
364  bool reading_direction_is_ltr =
365  current_paragraph_is_ltr_ ^ in_minor_direction_;
366  // scan forward to see what meta-information the word ordering algorithm
367  // left us.
368  // If this word is at the *end* of a minor run, insert the other
369  // direction's mark; else if this was a complex word, insert the
370  // current reading order's mark.
371  GenericVectorEqEq<int> textline_order;
372  CalculateTextlineOrder(current_paragraph_is_ltr_,
373  *this, &textline_order);
374  int this_word_index = LTRWordIndex();
375  int i = textline_order.get_index(this_word_index);
376  if (i < 0) return;
377 
378  int last_non_word_mark = 0;
379  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
380  last_non_word_mark = textline_order[i];
381  }
382  if (last_non_word_mark == kComplexWord) {
383  *text += reading_direction_is_ltr ? kLRM : kRLM;
384  } else if (last_non_word_mark == kMinorRunEnd) {
385  if (current_paragraph_is_ltr_) {
386  *text += kLRM;
387  } else {
388  *text += kRLM;
389  }
390  }
391 }
392 
// Moves the iterator to the word (and, within it, the symbol) that comes
// first in reading order on the current text line, updating
// in_minor_direction_ / at_beginning_of_minor_run_ from any sentinels that
// precede that word in the computed order.
393 void ResultIterator::MoveToLogicalStartOfTextline() {
394  GenericVectorEqEq<int> word_indices;
395  RestartRow();
396  CalculateTextlineOrder(current_paragraph_is_ltr_,
397  dynamic_cast<const LTRResultIterator&>(*this),
398  &word_indices);
// Skip leading sentinels (negative entries), tracking whether the first real
// word lies inside a minor-direction run.
399  int i = 0;
400  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
401  if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
402  else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
403  }
404  if (in_minor_direction_) at_beginning_of_minor_run_ = true;
// No real word on the line: stay where RestartRow() left us.
405  if (i >= word_indices.size()) return;
406  int first_word_index = word_indices[i];
// NOTE(review): the loop body (listing line 408) is absent from this
// listing; presumably it advances one word per iteration so that the
// iterator ends up on word first_word_index - confirm against the source.
407  for (int j = 0; j < first_word_index; j++) {
409  }
410  MoveToLogicalStartOfWord();
411 }
412 
// Tail of a reset routine: recomputes the paragraph direction, clears the
// minor-run state and moves to the logical start of the text line.
// NOTE(review): the signature line and the first body line(s) (listing
// lines 413-414) are absent from this listing.
415  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
416  in_minor_direction_ = false;
417  at_beginning_of_minor_run_ = false;
418  MoveToLogicalStartOfTextline();
419 }
420 
// Advances the iterator by one element of the given level in reading order.
// Returns false once there is nothing further to move to.
// NOTE(review): the signature line (listing line 421) and listing lines
// 428, 477 and 479 are absent from this listing; notes below mark the gaps.
422  if (it_->block() == NULL) return false; // already at end!
423  switch (level) {
424  case RIL_BLOCK: // explicit fall-through
425  case RIL_PARA: // explicit fall-through
426  case RIL_TEXTLINE:
// Block, paragraph and line steps are delegated to PageIterator; afterwards
// the bidi state is re-derived for the new line.
427  if (!PageIterator::Next(level)) return false;
// NOTE(review): the `if (...) {` line that opens the brace closed at listing
// line 432 is missing here.
429  // if we've advanced to a new paragraph,
430  // recalculate current_paragraph_is_ltr_
431  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
432  }
433  in_minor_direction_ = false;
434  MoveToLogicalStartOfTextline();
435  return it_->block() != NULL;
436  case RIL_SYMBOL:
437  {
// Locate the current blob in the word's reading order and step to the one
// after it, if the word has one.
438  GenericVector<int> blob_order;
439  CalculateBlobOrder(&blob_order);
440  int next_blob = 0;
441  while (next_blob < blob_order.size() &&
442  blob_index_ != blob_order[next_blob])
443  next_blob++;
444  next_blob++;
445  if (next_blob < blob_order.size()) {
446  // we're in the same word; simply advance one blob.
447  BeginWord(blob_order[next_blob]);
448  at_beginning_of_minor_run_ = false;
449  return true;
450  }
451  level = RIL_WORD; // we've fallen through to the next word.
452  }
453  case RIL_WORD: // explicit fall-through.
454  {
// With no word at the current position, fall back to a block-level step.
455  if (it_->word() == NULL) return Next(RIL_BLOCK);
456  GenericVectorEqEq<int> word_indices;
457  int this_word_index = LTRWordIndex();
458  CalculateTextlineOrder(current_paragraph_is_ltr_,
459  *this,
460  &word_indices);
// final_real_index: position of the last non-sentinel entry, so trailing
// sentinels are never treated as a successor word.
461  int final_real_index = word_indices.size() - 1;
462  while (final_real_index > 0 && word_indices[final_real_index] < 0)
463  final_real_index--;
// Find the current word among the entries before the last real one; if it is
// the last real word (or is not found) the loop falls out to the
// end-of-line handling below.
464  for (int i = 0; i < final_real_index; i++) {
465  if (word_indices[i] == this_word_index) {
// Skip the sentinels between this word and the next, updating the minor-run
// state as they are crossed.
466  int j = i + 1;
467  for (; j < final_real_index && word_indices[j] < 0; j++) {
468  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
469  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
470  }
471  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
472  // awesome, we move to word_indices[j]
473  if (BidiDebug(3)) {
474  tprintf("Next(RIL_WORD): %d -> %d\n",
475  this_word_index, word_indices[j]);
476  }
// NOTE(review): listing line 477 and the loop body (listing line 479) are
// missing; presumably they restart the row and step forward word by word to
// reach word_indices[j] - confirm against the source.
478  for (int k = 0; k < word_indices[j]; k++) {
480  }
481  MoveToLogicalStartOfWord();
482  return true;
483  }
484  }
485  if (BidiDebug(3)) {
486  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
487  }
488  // we're going off the end of the text line.
489  return Next(RIL_TEXTLINE);
490  }
491  }
492  ASSERT_HOST(false); // shouldn't happen.
493  return false;
494 }
495 
// Reports whether the iterator sits at the first position, in reading
// order, of the element of the given level. Each coarser level requires the
// finer ones: line start needs word start, block start needs line start, and
// paragraph start is either a block start or a line start whose row belongs
// to a different paragraph than the previous row.
// NOTE(review): the signature line is absent from this listing.
497  if (it_->block() == NULL) return false; // Already at the end!
498  if (it_->word() == NULL) return true; // In an image block.
499  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
500 
501  bool at_word_start = IsAtFirstSymbolOfWord();
502  if (level == RIL_WORD) return at_word_start;
503 
// Work on a copy so that this (const) iterator is left untouched.
504  ResultIterator line_start(*this);
505  // move to the first word in the line...
506  line_start.MoveToLogicalStartOfTextline();
507 
508  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
509  if (level == RIL_TEXTLINE) return at_textline_start;
510 
511  // now we move to the left-most word...
512  line_start.RestartRow();
513  bool at_block_start = at_textline_start &&
514  line_start.it_->block() != line_start.it_->prev_block();
515  if (level == RIL_BLOCK) return at_block_start;
516 
517  bool at_para_start = at_block_start ||
518  (at_textline_start &&
519  line_start.it_->row()->row->para() !=
520  line_start.it_->prev_row()->row->para());
521  if (level == RIL_PARA) return at_para_start;
522 
523  ASSERT_HOST(false); // shouldn't happen.
524  return false;
525 }
526 
// Reports whether the current `element` is the last one inside the enclosing
// `level`. Implemented by stepping a copy of the iterator forward by one
// `element` and checking where it lands.
// NOTE(review): the first line of the signature (with the `level`
// parameter) is absent from this listing.
533  PageIteratorLevel element) const {
534  if (Empty(element)) return true; // Already at the end!
535  // The result is true if we step forward by element and find we are
536  // at the end of the page or at beginning of *all* levels in:
537  // [level, element).
538  // When there is more than one level difference between element and level,
539  // we could for instance move forward one symbol and still be at the first
540  // word on a line, so we also have to be at the first symbol in a word.
541  ResultIterator next(*this);
542  next.Next(element);
543  if (next.Empty(element)) return true; // Reached the end of the page.
// Check each level from just above `element` up to and including `level`.
544  while (element > level) {
545  element = static_cast<PageIteratorLevel>(element - 1);
546  if (!next.IsAtBeginningOf(element))
547  return false;
548  }
549  return true;
550 }
551 
// Builds the UTF-8 text of the element of the given level at the current
// position, in reading order, and returns it as a NUL-terminated array
// allocated with new[] (presumably to be released by the caller with
// delete[]). Returns NULL when there is no word at the current position.
// NOTE(review): the signature line is absent from this listing.
557  if (it_->word() == NULL) return NULL; // Already at the end!
558  STRING text;
559  switch (level) {
560  case RIL_BLOCK:
561  {
// Concatenate every paragraph that still belongs to the current block.
562  ResultIterator pp(*this);
563  do {
564  pp.AppendUTF8ParagraphText(&text);
565  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
566  }
567  break;
568  case RIL_PARA:
569  AppendUTF8ParagraphText(&text);
570  break;
571  case RIL_TEXTLINE:
572  {
// Use a copy because appending a text line advances the iterator.
573  ResultIterator it(*this);
574  it.MoveToLogicalStartOfTextline();
575  it.IterateAndAppendUTF8TextlineText(&text);
576  }
577  break;
578  case RIL_WORD:
579  AppendUTF8WordText(&text);
580  break;
581  case RIL_SYMBOL:
582  {
583  bool reading_direction_is_ltr =
584  current_paragraph_is_ltr_ ^ in_minor_direction_;
585  if (at_beginning_of_minor_run_) {
586  text += reading_direction_is_ltr ? kLRM : kRLM;
587  }
// NOTE(review): the assignment below replaces `text`, so the directional
// mark appended just above is discarded. If the mark is meant to prefix the
// symbol this should be `+=`; confirm the intended output before changing,
// since callers may rely on the current behaviour.
588  text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
589  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
590  }
591  break;
592  }
// Copy the result, including the terminating NUL, into a fresh buffer.
593  int length = text.length() + 1;
594  char* result = new char[length];
595  strncpy(result, text.string(), length);
596  return result;
597 }
598 
599 void ResultIterator::AppendUTF8WordText(STRING *text) const {
600  if (!it_->word()) return;
601  ASSERT_HOST(it_->word()->best_choice != NULL);
602  bool reading_direction_is_ltr =
603  current_paragraph_is_ltr_ ^ in_minor_direction_;
604  if (at_beginning_of_minor_run_) {
605  *text += reading_direction_is_ltr ? kLRM : kRLM;
606  }
607 
608  GenericVector<int> blob_order;
609  CalculateBlobOrder(&blob_order);
610  for (int i = 0; i < blob_order.size(); i++) {
611  *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
612  }
613  AppendSuffixMarks(text);
614 }
615 
// Appends the text of the current line to *text, word by word in reading
// order, followed by line_separator_ and - when the line ends a paragraph or
// the page - paragraph_separator_. The iterator itself is advanced and ends
// up past the line.
616 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
// With no word here, just step forward and append nothing.
617  if (Empty(RIL_WORD)) {
618  Next(RIL_WORD);
619  return;
620  }
// Optional debug dump of the word directions and the computed reading order.
// NOTE(review): the declaration of `dirs` (listing line 623) is absent from
// this listing.
621  if (BidiDebug(1)) {
622  GenericVectorEqEq<int> textline_order;
624  CalculateTextlineOrder(current_paragraph_is_ltr_,
625  *this, &dirs, &textline_order);
626  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
627  current_paragraph_is_ltr_ ? "ltr" : "rtl");
628  PrintScriptDirs(dirs);
629  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
630  current_paragraph_is_ltr_ ? "ltr" : "rtl");
631  for (int i = 0; i < textline_order.size(); i++) {
632  tprintf("%d ", textline_order[i]);
633  }
634  tprintf("\n");
635  }
636 
637  int words_appended = 0;
638  do {
// Spacing before each word: the word's own space() count when
// preserve_interword_spaces_ is set, otherwise a single space before every
// word except the first.
639  int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
640  : (words_appended > 0);
641  for (int i = 0; i < numSpaces; ++i) {
642  *text += " ";
643  }
644  AppendUTF8WordText(text);
645  words_appended++;
646  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
647  if (BidiDebug(1)) {
648  tprintf("%d words printed\n", words_appended);
649  }
650  *text += line_separator_;
651  // If we just finished a paragraph, add an extra newline.
652  if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
653  *text += paragraph_separator_;
654 }
655 
656 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
657  ResultIterator it(*this);
658  it.RestartParagraph();
659  it.MoveToLogicalStartOfTextline();
660  if (it.Empty(RIL_WORD)) return;
661  do {
662  it.IterateAndAppendUTF8TextlineText(text);
663  } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
664 }
665 
// Returns true when the effective bidi debug level is at least min_level.
// The level comes from the "bidi_debug" integer parameter when it can be
// found and otherwise defaults to 1.
666 bool ResultIterator::BidiDebug(int min_level) const {
667  int debug_level = 1;
// NOTE(review): listing line 670 (the remaining FindParam arguments) is
// absent from this listing.
668  IntParam *p = ParamUtils::FindParam<IntParam>(
669  "bidi_debug", GlobalParams()->int_params,
671  if (p != NULL) debug_level = (inT32)(*p);
672  return debug_level >= min_level;
673 }
674 
675 } // namespace tesseract.
ROW * row
Definition: pageres.h:127
GenericVector< BoolParam * > bool_params
Definition: params.h:45
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
PARA * para() const
Definition: ocrrow.h:115
WERD_RES * word() const
Definition: pageres.h:736
BLOCK_RES * prev_block() const
Definition: pageres.h:733
inT32 length() const
Definition: strngs.cpp:196
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
const char * kRLM
Definition: unicodes.cpp:28
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:345
StrongScriptDirection
Definition: unichar.h:40
ROW_RES * row() const
Definition: pageres.h:739
virtual char * GetUTF8Text(PageIteratorLevel level) const
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
int get_index(T object) const
T & back() const
TESS_LOCAL void BeginWord(int offset)
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool UnicharsInReadingOrder() const
Definition: pageres.h:409
int push_back(T object)
GenericVector< IntParam * > int_params
Definition: params.h:44
const char * string() const
Definition: strngs.cpp:201
uinT8 space()
Definition: werd.h:104
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
bool Empty(PageIteratorLevel level) const
virtual bool Next(PageIteratorLevel level)
void truncate(int size)
static const int kMinorRunEnd
virtual void RestartRow()
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
static const int kMinorRunStart
ROW_RES * prev_row() const
Definition: pageres.h:730
BLOCK_RES * block() const
Definition: pageres.h:742
int inT32
Definition: host.h:35
ParamsVectors * params()
Definition: ccutil.h:63
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
static const int kComplexWord
const char * kLRM
Definition: unicodes.cpp:27
WERD * word
Definition: pageres.h:175
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:367
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)