tesseract  3.05.02
equationdetect.cpp
Go to the documentation of this file.
1 // File: equationdetect.cpp
3 // Description: Helper classes to detect equations.
4 // Author: Zongyi (Joe) Liu (joeliu@google.com)
5 // Created: Fri Aug 31 11:13:01 PST 2011
6 //
7 // (C) Copyright 2011, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #include <mathfix.h>
23 #include <windows.h>
24 #endif
25 
26 #ifdef __MINGW32__
27 #include <limits.h>
28 #endif
29 
30 #include <float.h>
31 
32 // Include automatically generated configuration file if running autoconf.
33 #ifdef HAVE_CONFIG_H
34 #include "config_auto.h"
35 #endif
36 
37 #include "equationdetect.h"
38 
39 #include "bbgrid.h"
40 #include "classify.h"
41 #include "colpartition.h"
42 #include "colpartitiongrid.h"
43 #include "colpartitionset.h"
44 #include "helpers.h"
45 #include "ratngs.h"
46 #include "tesseractclass.h"
47 
48 // Config variables.
49 BOOL_VAR(equationdetect_save_bi_image, false, "Save input bi image");
50 BOOL_VAR(equationdetect_save_spt_image, false, "Save special character image");
51 BOOL_VAR(equationdetect_save_seed_image, false, "Save the seed image");
52 BOOL_VAR(equationdetect_save_merged_image, false, "Save the merged image");
53 
54 namespace tesseract {
55 
57 // Utility ColParition sort functions.
59 static int SortCPByTopReverse(const void* p1, const void* p2) {
60  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
61  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
62  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
63  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
64  return box2.top() - box1.top();
65 }
66 
67 static int SortCPByBottom(const void* p1, const void* p2) {
68  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
69  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
70  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
71  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
72  return box1.bottom() - box2.bottom();
73 }
74 
75 static int SortCPByHeight(const void* p1, const void* p2) {
76  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
77  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
78  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
79  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
80  return box1.height() - box2.height();
81 }
82 
83 // TODO(joeliu): we may want to parameterize these constants.
84 const float kMathDigitDensityTh1 = 0.25;
85 const float kMathDigitDensityTh2 = 0.1;
86 const float kMathItalicDensityTh = 0.5;
87 const float kUnclearDensityTh = 0.25;
88 const int kSeedBlobsCountTh = 10;
90 
91 // Returns true if PolyBlockType is of text type or equation type.
93  return PTIsTextType(type) || type == PT_EQUATION;
94 }
95 
96 inline bool IsLeftIndented(const EquationDetect::IndentType type) {
97  return type == EquationDetect::LEFT_INDENT ||
99 }
100 
102  return type == EquationDetect::RIGHT_INDENT ||
104 }
105 
106 EquationDetect::EquationDetect(const char* equ_datapath,
107  const char* equ_name) {
108  const char* default_name = "equ";
109  if (equ_name == NULL) {
110  equ_name = default_name;
111  }
112  lang_tesseract_ = NULL;
113  resolution_ = 0;
114  page_count_ = 0;
115 
116  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
118  tprintf("Warning: equation region detection requested,"
119  " but %s failed to load from %s\n", equ_name, equ_datapath);
120  }
121 
122  cps_super_bbox_ = NULL;
123 }
124 
126  delete(cps_super_bbox_);
127 }
128 
130  lang_tesseract_ = lang_tesseract;
131 }
132 
133 void EquationDetect::SetResolution(const int resolution) {
134  resolution_ = resolution;
135 }
136 
138  if (to_block == NULL) {
139  tprintf("Warning: input to_block is NULL!\n");
140  return -1;
141  }
142 
144  blob_lists.push_back(&(to_block->blobs));
145  blob_lists.push_back(&(to_block->large_blobs));
146  for (int i = 0; i < blob_lists.size(); ++i) {
147  BLOBNBOX_IT bbox_it(blob_lists[i]);
148  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
149  bbox_it.forward()) {
150  bbox_it.data()->set_special_text_type(BSTT_NONE);
151  }
152  }
153 
154  return 0;
155 }
156 
158  BLOBNBOX *blobnbox, const int height_th) {
159  ASSERT_HOST(blobnbox != NULL);
160  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
161  // For small blob, we simply set to BSTT_NONE.
162  blobnbox->set_special_text_type(BSTT_NONE);
163  return;
164  }
165 
166  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
167  C_BLOB* blob = blobnbox->cblob();
168  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
169  // each classifier here, as they may require different PolygonalCopy.
170  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
171  const TBOX& box = tblob->bounding_box();
172 
173  // Normalize the blob. Set the origin to the place we want to be the
174  // bottom-middle, and scaling is to make the height the x-height.
175  float scaling = static_cast<float>(kBlnXHeight) / box.height();
176  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
177  TBLOB* normed_blob = new TBLOB(*tblob);
178  normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
179  0.0f, static_cast<float>(kBlnBaselineOffset),
180  false, NULL);
181  equ_tesseract_.AdaptiveClassifier(normed_blob, &ratings_equ);
182  lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang);
183  delete normed_blob;
184  delete tblob;
185 
186  // Get the best choice from ratings_lang and rating_equ. As the choice in the
187  // list has already been sorted by the certainty, we simply use the first
188  // choice.
189  BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL;
190  if (ratings_lang.length() > 0) {
191  BLOB_CHOICE_IT choice_it(&ratings_lang);
192  lang_choice = choice_it.data();
193  }
194  if (ratings_equ.length() > 0) {
195  BLOB_CHOICE_IT choice_it(&ratings_equ);
196  equ_choice = choice_it.data();
197  }
198 
199  float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
200  float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
201 
202  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
203  // The scores here are negative, so the max/min == fabs(min/max).
204  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
205  float diff = fabs(lang_score - equ_score);
207 
208  // Classification.
209  if (fmax(lang_score, equ_score) < kConfScoreTh) {
210  // If both score are very small, then mark it as unclear.
211  type = BSTT_UNCLEAR;
212  } else if (diff > kConfDiffTh && equ_score > lang_score) {
213  // If equ_score is significantly higher, then we classify this character as
214  // math symbol.
215  type = BSTT_MATH;
216  } else if (lang_choice) {
217  // For other cases: lang_score is similar or significantly higher.
218  type = EstimateTypeForUnichar(
219  lang_tesseract_->unicharset, lang_choice->unichar_id());
220  }
221 
222  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
223  lang_choice->fontinfo_id()).is_italic()) {
224  // For text symbol, we still check if it is italic.
226  } else {
227  blobnbox->set_special_text_type(type);
228  }
229 }
230 
232  const UNICHARSET& unicharset, const UNICHAR_ID id) const {
233  STRING s = unicharset.id_to_unichar(id);
234  if (unicharset.get_isalpha(id)) {
235  return BSTT_NONE;
236  }
237 
238  if (unicharset.get_ispunctuation(id)) {
239  // Exclude some special texts that are likely to be confused as math symbol.
240  static GenericVector<UNICHAR_ID> ids_to_exclude;
241  if (ids_to_exclude.empty()) {
242  static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
243  "〈", "〉", "《", "》", "」", "「", ""};
244  int i = 0;
245  while (kCharsToEx[i] != "") {
246  ids_to_exclude.push_back(
247  unicharset.unichar_to_id(kCharsToEx[i++].string()));
248  }
249  ids_to_exclude.sort();
250  }
251  return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
252  }
253 
254  // Check if it is digit. In addition to the isdigit attribute, we also check
255  // if this character belongs to those likely to be confused with a digit.
256  static const STRING kDigitsChars = "|";
257  if (unicharset.get_isdigit(id) ||
258  (s.length() == 1 && kDigitsChars.contains(s[0]))) {
259  return BSTT_DIGIT;
260  } else {
261  return BSTT_MATH;
262  }
263 }
264 
266  // Set configuration for Tesseract::AdaptiveClassifier.
267  equ_tesseract_.tess_cn_matching.set_value(true); // turn it on
268  equ_tesseract_.tess_bn_matching.set_value(false);
269 
270  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
271  int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
272  int classify_integer_matcher =
276 
278  ColPartition *part = NULL;
279  gsearch.StartFullSearch();
280  while ((part = gsearch.NextFullSearch()) != NULL) {
281  if (!IsTextOrEquationType(part->type())) {
282  continue;
283  }
284  IdentifyBlobsToSkip(part);
285  BLOBNBOX_C_IT bbox_it(part->boxes());
286  // Compute the height threshold.
287  GenericVector<int> blob_heights;
288  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
289  bbox_it.forward()) {
290  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
291  blob_heights.push_back(bbox_it.data()->bounding_box().height());
292  }
293  }
294  blob_heights.sort();
295  int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
296  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
297  bbox_it.forward()) {
298  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
299  IdentifySpecialText(bbox_it.data(), height_th);
300  }
301  }
302  }
303 
304  // Set the multiplier values back.
306  classify_class_pruner);
308  classify_integer_matcher);
309 
310  if (equationdetect_save_spt_image) { // For debug.
311  STRING outfile;
312  GetOutputTiffName("_spt", &outfile);
313  PaintSpecialTexts(outfile);
314  }
315 }
316 
318  ASSERT_HOST(part);
319  BLOBNBOX_C_IT blob_it(part->boxes());
320 
321  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
322  // At this moment, no blob should have been joined.
323  ASSERT_HOST(!blob_it.data()->joined_to_prev());
324  }
325  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
326  BLOBNBOX* blob = blob_it.data();
327  if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
328  continue;
329  }
330  TBOX blob_box = blob->bounding_box();
331 
332  // Search if any blob can be merged into blob. If found, then we mark all
333  // these blobs as BSTT_SKIP.
334  BLOBNBOX_C_IT blob_it2 = blob_it;
335  bool found = false;
336  while (!blob_it2.at_last()) {
337  BLOBNBOX* nextblob = blob_it2.forward();
338  const TBOX& nextblob_box = nextblob->bounding_box();
339  if (nextblob_box.left() >= blob_box.right()) {
340  break;
341  }
342  const float kWidthR = 0.4, kHeightR = 0.3;
343  bool xoverlap = blob_box.major_x_overlap(nextblob_box),
344  yoverlap = blob_box.y_overlap(nextblob_box);
345  float widthR = static_cast<float>(
346  MIN(nextblob_box.width(), blob_box.width())) /
347  MAX(nextblob_box.width(), blob_box.width());
348  float heightR = static_cast<float>(
349  MIN(nextblob_box.height(), blob_box.height())) /
350  MAX(nextblob_box.height(), blob_box.height());
351 
352  if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
353  // Found one, set nextblob type and recompute blob_box.
354  found = true;
355  nextblob->set_special_text_type(BSTT_SKIP);
356  blob_box += nextblob_box;
357  }
358  }
359  if (found) {
361  }
362  }
363 }
364 
366  ColPartitionGrid* part_grid, ColPartitionSet** best_columns) {
367  if (!lang_tesseract_) {
368  tprintf("Warning: lang_tesseract_ is NULL!\n");
369  return -1;
370  }
371  if (!part_grid || !best_columns) {
372  tprintf("part_grid/best_columns is NULL!!\n");
373  return -1;
374  }
375  cp_seeds_.clear();
376  part_grid_ = part_grid;
377  best_columns_ = best_columns;
379  STRING outfile;
380  page_count_++;
381 
383  GetOutputTiffName("_bi", &outfile);
384  pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
385  }
386 
387  // Pass 0: Compute special text type for blobs.
389 
390  // Pass 1: Merge parts by overlap.
392 
393  // Pass 2: compute the math blob density and find the seed partition.
395  // We still need separate seed into block seed and inline seed partition.
397 
399  GetOutputTiffName("_seed", &outfile);
400  PaintColParts(outfile);
401  }
402 
403  // Pass 3: expand block equation seeds.
404  while (!cp_seeds_.empty()) {
405  GenericVector<ColPartition*> seeds_expanded;
406  for (int i = 0; i < cp_seeds_.size(); ++i) {
407  if (ExpandSeed(cp_seeds_[i])) {
408  // If this seed is expanded, then we add it into seeds_expanded. Note
409  // this seed has been removed from part_grid_ if it is expanded.
410  seeds_expanded.push_back(cp_seeds_[i]);
411  }
412  }
413  // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
414  for (int i = 0; i < seeds_expanded.size(); ++i) {
415  InsertPartAfterAbsorb(seeds_expanded[i]);
416  }
417  cp_seeds_ = seeds_expanded;
418  }
419 
420  // Pass 4: find math block satellite text partitions and merge them.
422 
423  if (equationdetect_save_merged_image) { // For debug.
424  GetOutputTiffName("_merged", &outfile);
425  PaintColParts(outfile);
426  }
427 
428  return 0;
429 }
430 
432  while (true) {
433  ColPartition* part = NULL;
434  // partitions that have been updated.
435  GenericVector<ColPartition*> parts_updated;
437  gsearch.StartFullSearch();
438  while ((part = gsearch.NextFullSearch()) != NULL) {
439  if (!IsTextOrEquationType(part->type())) {
440  continue;
441  }
442  GenericVector<ColPartition*> parts_to_merge;
443  SearchByOverlap(part, &parts_to_merge);
444  if (parts_to_merge.empty()) {
445  continue;
446  }
447 
448  // Merge parts_to_merge with part, and remove them from part_grid_.
449  part_grid_->RemoveBBox(part);
450  for (int i = 0; i < parts_to_merge.size(); ++i) {
451  ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part);
452  part->Absorb(parts_to_merge[i], NULL);
453  }
454  gsearch.RepositionIterator();
455 
456  parts_updated.push_back(part);
457  }
458 
459  if (parts_updated.empty()) { // Exit the loop
460  break;
461  }
462 
463  // Re-insert parts_updated into part_grid_.
464  for (int i = 0; i < parts_updated.size(); ++i) {
465  InsertPartAfterAbsorb(parts_updated[i]);
466  }
467  }
468 }
469 
471  ColPartition* seed,
472  GenericVector<ColPartition*>* parts_overlap) {
473  ASSERT_HOST(seed != NULL && parts_overlap != NULL);
474  if (!IsTextOrEquationType(seed->type())) {
475  return;
476  }
478  const TBOX& seed_box(seed->bounding_box());
479  const int kRadNeighborCells = 30;
480  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
481  (seed_box.top() + seed_box.bottom()) / 2,
482  kRadNeighborCells);
483  search.SetUniqueMode(true);
484 
485  // Search iteratively.
486  ColPartition *part;
488  const float kLargeOverlapTh = 0.95;
489  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
490  while ((part = search.NextRadSearch()) != NULL) {
491  if (part == seed || !IsTextOrEquationType(part->type())) {
492  continue;
493  }
494  const TBOX& part_box(part->bounding_box());
495  bool merge = false;
496 
497  float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
498  y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
499 
500  // If part is large overlapped with seed, then set merge to true.
501  if (x_overlap_fraction >= kLargeOverlapTh &&
502  y_overlap_fraction >= kLargeOverlapTh) {
503  merge = true;
504  } else if (seed->type() == PT_EQUATION &&
505  IsTextOrEquationType(part->type())) {
506  if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
507  (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
508  merge = true;
509  }
510  }
511 
512  if (merge) { // Remove the part from search and put it into parts.
513  search.RemoveBBox();
514  parts_overlap->push_back(part);
515  }
516  }
517 }
518 
520  ASSERT_HOST(part);
521 
522  // Before insert part back into part_grid_, we will need re-compute some
523  // of its attributes such as first_column_, last_column_. However, we still
524  // want to preserve its type.
525  BlobTextFlowType flow_type = part->flow();
526  PolyBlockType part_type = part->type();
527  BlobRegionType blob_type = part->blob_type();
528 
529  // Call SetPartitionType to re-compute the attributes of part.
530  const TBOX& part_box(part->bounding_box());
531  int grid_x, grid_y;
533  part_box.left(), part_box.bottom(), &grid_x, &grid_y);
535 
536  // Reset the types back.
537  part->set_type(part_type);
538  part->set_blob_type(blob_type);
539  part->set_flow(flow_type);
540  part->SetBlobTypes();
541 
542  // Insert into part_grid_.
543  part_grid_->InsertBBox(true, true, part);
544 }
545 
548  ColPartition *part = NULL;
549  gsearch.StartFullSearch();
550 
551  GenericVector<ColPartition*> seeds1, seeds2;
552  // The left coordinates of indented text partitions.
553  GenericVector<int> indented_texts_left;
554  // The foreground density of text partitions.
555  GenericVector<float> texts_foreground_density;
556  while ((part = gsearch.NextFullSearch()) != NULL) {
557  if (!IsTextOrEquationType(part->type())) {
558  continue;
559  }
561  bool blobs_check = CheckSeedBlobsCount(part);
562  const int kTextBlobsTh = 20;
563 
565  blobs_check) {
566  // Passed high density threshold test, save into seeds1.
567  seeds1.push_back(part);
568  } else {
569  IndentType indent = IsIndented(part);
570  if (IsLeftIndented(indent) && blobs_check &&
572  // Passed low density threshold test and is indented, save into seeds2.
573  seeds2.push_back(part);
574  } else if (!IsRightIndented(indent) &&
575  part->boxes_count() > kTextBlobsTh) {
576  // This is likely to be a text part, save the features.
577  const TBOX&box = part->bounding_box();
578  if (IsLeftIndented(indent)) {
579  indented_texts_left.push_back(box.left());
580  }
581  texts_foreground_density.push_back(ComputeForegroundDensity(box));
582  }
583  }
584  }
585 
586  // Sort the features collected from text regions.
587  indented_texts_left.sort();
588  texts_foreground_density.sort();
589  float foreground_density_th = 0.15; // Default value.
590  if (!texts_foreground_density.empty()) {
591  // Use the median of the texts_foreground_density.
592  foreground_density_th = 0.8 * texts_foreground_density[
593  texts_foreground_density.size() / 2];
594  }
595 
596  for (int i = 0; i < seeds1.size(); ++i) {
597  const TBOX& box = seeds1[i]->bounding_box();
598  if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
599  !(IsLeftIndented(IsIndented(seeds1[i])) &&
600  CountAlignment(indented_texts_left, box.left()) >=
602  // Mark as PT_EQUATION type.
603  seeds1[i]->set_type(PT_EQUATION);
604  cp_seeds_.push_back(seeds1[i]);
605  } else { // Mark as PT_INLINE_EQUATION type.
606  seeds1[i]->set_type(PT_INLINE_EQUATION);
607  }
608  }
609 
610  for (int i = 0; i < seeds2.size(); ++i) {
611  if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
612  seeds2[i]->set_type(PT_EQUATION);
613  cp_seeds_.push_back(seeds2[i]);
614  }
615  }
616 }
617 
619  Pix *pix_bi = lang_tesseract_->pix_binary();
620  int pix_height = pixGetHeight(pix_bi);
621  Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
622  tbox.width(), tbox.height());
623  Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL);
624  l_float32 fract;
625  pixForegroundFraction(pix_sub, &fract);
626  pixDestroy(&pix_sub);
627  boxDestroy(&box);
628 
629  return fract;
630 }
631 
632 bool EquationDetect::CheckSeedFgDensity(const float density_th,
633  ColPartition* part) {
634  ASSERT_HOST(part);
635 
636  // Split part horizontall, and check for each sub part.
637  GenericVector<TBOX> sub_boxes;
638  SplitCPHorLite(part, &sub_boxes);
639  float parts_passed = 0.0;
640  for (int i = 0; i < sub_boxes.size(); ++i) {
641  float density = ComputeForegroundDensity(sub_boxes[i]);
642  if (density < density_th) {
643  parts_passed++;
644  }
645  }
646 
647  // If most sub parts passed, then we return true.
648  const float kSeedPartRatioTh = 0.3;
649  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
650 
651  return retval;
652 }
653 
655  GenericVector<ColPartition*>* parts_splitted) {
656  ASSERT_HOST(part && parts_splitted);
657  if (part->median_width() == 0 || part->boxes_count() == 0) {
658  return;
659  }
660 
661  // Make a copy of part, and reset parts_splitted.
662  ColPartition* right_part = part->CopyButDontOwnBlobs();
663  parts_splitted->delete_data_pointers();
664  parts_splitted->clear();
665 
666  const double kThreshold = part->median_width() * 3.0;
667  bool found_split = true;
668  while (found_split) {
669  found_split = false;
670  BLOBNBOX_C_IT box_it(right_part->boxes());
671  // Blobs are sorted left side first. If blobs overlap,
672  // the previous blob may have a "more right" right side.
673  // Account for this by always keeping the largest "right"
674  // so far.
675  int previous_right = MIN_INT32;
676 
677  // Look for the next split in the partition.
678  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
679  const TBOX& box = box_it.data()->bounding_box();
680  if (previous_right != MIN_INT32 &&
681  box.left() - previous_right > kThreshold) {
682  // We have a split position. Split the partition in two pieces.
683  // Insert the left piece in the grid and keep processing the right.
684  int mid_x = (box.left() + previous_right) / 2;
685  ColPartition* left_part = right_part;
686  right_part = left_part->SplitAt(mid_x);
687 
688  parts_splitted->push_back(left_part);
689  left_part->ComputeSpecialBlobsDensity();
690  found_split = true;
691  break;
692  }
693 
694  // The right side of the previous blobs.
695  previous_right = MAX(previous_right, box.right());
696  }
697  }
698 
699  // Add the last piece.
700  right_part->ComputeSpecialBlobsDensity();
701  parts_splitted->push_back(right_part);
702 }
703 
705  GenericVector<TBOX>* splitted_boxes) {
706  ASSERT_HOST(part && splitted_boxes);
707  splitted_boxes->clear();
708  if (part->median_width() == 0) {
709  return;
710  }
711 
712  const double kThreshold = part->median_width() * 3.0;
713 
714  // Blobs are sorted left side first. If blobs overlap,
715  // the previous blob may have a "more right" right side.
716  // Account for this by always keeping the largest "right"
717  // so far.
718  TBOX union_box;
719  int previous_right = MIN_INT32;
720  BLOBNBOX_C_IT box_it(part->boxes());
721  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
722  const TBOX& box = box_it.data()->bounding_box();
723  if (previous_right != MIN_INT32 &&
724  box.left() - previous_right > kThreshold) {
725  // We have a split position.
726  splitted_boxes->push_back(union_box);
727  previous_right = MIN_INT32;
728  }
729  if (previous_right == MIN_INT32) {
730  union_box = box;
731  } else {
732  union_box += box;
733  }
734  // The right side of the previous blobs.
735  previous_right = MAX(previous_right, box.right());
736  }
737 
738  // Add the last piece.
739  if (previous_right != MIN_INT32) {
740  splitted_boxes->push_back(union_box);
741  }
742 }
743 
745  const GenericVector<int>& indented_texts_left,
746  const float foreground_density_th,
747  ColPartition* part) {
748  ASSERT_HOST(part);
749  const TBOX& box = part->bounding_box();
750 
751  // Check if it is aligned with any indented_texts_left.
752  if (!indented_texts_left.empty() &&
753  CountAlignment(indented_texts_left, box.left()) >=
755  return false;
756  }
757 
758  // Check the foreground density.
759  if (ComputeForegroundDensity(box) > foreground_density_th) {
760  return false;
761  }
762 
763  return true;
764 }
765 
767  const GenericVector<int>& sorted_vec, const int val) const {
768  if (sorted_vec.empty()) {
769  return 0;
770  }
771  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
772  int pos = sorted_vec.binary_search(val), count = 0;
773 
774  // Search left side.
775  int index = pos;
776  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
777  count++;
778  }
779 
780  // Search right side.
781  index = pos + 1;
782  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
783  count++;
784  }
785 
786  return count;
787 }
788 
792  int textparts_linespacing = EstimateTextPartLineSpacing();
793  IdentifyInlinePartsVertical(true, textparts_linespacing);
794  IdentifyInlinePartsVertical(false, textparts_linespacing);
795 }
796 
799  ColPartition *part = NULL;
800  gsearch.StartFullSearch();
801  if (cps_super_bbox_) {
802  delete cps_super_bbox_;
803  }
804  cps_super_bbox_ = new TBOX();
805  while ((part = gsearch.NextFullSearch()) != NULL) {
806  (*cps_super_bbox_) += part->bounding_box();
807  }
808 }
809 
813  const int kMarginDiffTh = IntCastRounded(
815  const int kGapTh = static_cast<int>(roundf(
818  search.SetUniqueMode(true);
819  // The center x coordinate of the cp_super_bbox_.
820  int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
821  for (int i = 0; i < cp_seeds_.size(); ++i) {
822  ColPartition* part = cp_seeds_[i];
823  const TBOX& part_box(part->bounding_box());
824  int left_margin = part_box.left() - cps_super_bbox_->left(),
825  right_margin = cps_super_bbox_->right() - part_box.right();
826  bool right_to_left;
827  if (left_margin + kMarginDiffTh < right_margin &&
828  left_margin < kMarginDiffTh) {
829  // part is left aligned, so we search if it has any right neighbor.
830  search.StartSideSearch(
831  part_box.right(), part_box.top(), part_box.bottom());
832  right_to_left = false;
833  } else if (left_margin > cps_cx) {
834  // part locates on the right half on image, so search if it has any left
835  // neighbor.
836  search.StartSideSearch(
837  part_box.left(), part_box.top(), part_box.bottom());
838  right_to_left = true;
839  } else { // part is not an inline equation.
840  new_seeds.push_back(part);
841  continue;
842  }
843  ColPartition* neighbor = NULL;
844  bool side_neighbor_found = false;
845  while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) {
846  const TBOX& neighbor_box(neighbor->bounding_box());
847  if (!IsTextOrEquationType(neighbor->type()) ||
848  part_box.x_gap(neighbor_box) > kGapTh ||
849  !part_box.major_y_overlap(neighbor_box) ||
850  part_box.major_x_overlap(neighbor_box)) {
851  continue;
852  }
853  // We have found one. Set the side_neighbor_found flag.
854  side_neighbor_found = true;
855  break;
856  }
857  if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.
859  } else {
860  // Check the geometric feature of neighbor.
861  const TBOX& neighbor_box(neighbor->bounding_box());
862  if (neighbor_box.width() > part_box.width() &&
863  neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.
865  } else { // part is not an inline equation type.
866  new_seeds.push_back(part);
867  }
868  }
869  }
870 
871  // Reset the cp_seeds_ using the new_seeds.
872  cp_seeds_ = new_seeds;
873 }
874 
877 
878  // Get the y gap between text partitions;
879  ColPartition *current = NULL, *prev = NULL;
880  gsearch.StartFullSearch();
881  GenericVector<int> ygaps;
882  while ((current = gsearch.NextFullSearch()) != NULL) {
883  if (!PTIsTextType(current->type())) {
884  continue;
885  }
886  if (prev != NULL) {
887  const TBOX &current_box = current->bounding_box();
888  const TBOX &prev_box = prev->bounding_box();
889  // prev and current should be x major overlap and non y overlap.
890  if (current_box.major_x_overlap(prev_box) &&
891  !current_box.y_overlap(prev_box)) {
892  int gap = current_box.y_gap(prev_box);
893  if (gap < MIN(current_box.height(), prev_box.height())) {
894  // The gap should be smaller than the height of the bounding boxes.
895  ygaps.push_back(gap);
896  }
897  }
898  }
899  prev = current;
900  }
901 
902  if (ygaps.size() < 8) { // We do not have enough data.
903  return -1;
904  }
905 
906  // Compute the line spacing from ygaps: use the mean of the first half.
907  ygaps.sort();
908  int spacing = 0, count;
909  for (count = 0; count < ygaps.size() / 2; count++) {
910  spacing += ygaps[count];
911  }
912  return spacing / count;
913 }
914 
916  const bool top_to_bottom, const int textparts_linespacing) {
917  if (cp_seeds_.empty()) {
918  return;
919  }
920 
921  // Sort cp_seeds_.
922  if (top_to_bottom) { // From top to bottom.
923  cp_seeds_.sort(&SortCPByTopReverse);
924  } else { // From bottom to top.
925  cp_seeds_.sort(&SortCPByBottom);
926  }
927 
929  for (int i = 0; i < cp_seeds_.size(); ++i) {
930  ColPartition* part = cp_seeds_[i];
931  // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
932  // for its top neighbors, so that if two/more inline regions are connected
933  // to each other, then we will identify the top one, and then use it to
934  // identify the bottom one.
935  if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
937  } else {
938  new_seeds.push_back(part);
939  }
940  }
941  cp_seeds_ = new_seeds;
942 }
943 
944 bool EquationDetect::IsInline(const bool search_bottom,
945  const int textparts_linespacing,
946  ColPartition* part) {
947  ASSERT_HOST(part != NULL);
948  // Look for its nearest vertical neighbor that hardly overlaps in y but
949  // largely overlaps in x.
951  ColPartition *neighbor = NULL;
952  const TBOX& part_box(part->bounding_box());
953  const float kYGapRatioTh = 1.0;
954 
955  if (search_bottom) {
956  search.StartVerticalSearch(part_box.left(), part_box.right(),
957  part_box.bottom());
958  } else {
959  search.StartVerticalSearch(part_box.left(), part_box.right(),
960  part_box.top());
961  }
962  search.SetUniqueMode(true);
963  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
964  const TBOX& neighbor_box(neighbor->bounding_box());
965  if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
966  MIN(part_box.height(), neighbor_box.height())) {
967  // Finished searching.
968  break;
969  }
970  if (!PTIsTextType(neighbor->type())) {
971  continue;
972  }
973 
974  // Check if neighbor and part is inline similar.
975  const float kHeightRatioTh = 0.5;
976  const int kYGapTh = textparts_linespacing > 0 ?
977  textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
978  static_cast<int>(roundf(0.05 * resolution_)); // Default value.
979  if (part_box.x_overlap(neighbor_box) && // Location feature.
980  part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
981  // Geo feature.
982  static_cast<float>(MIN(part_box.height(), neighbor_box.height())) /
983  MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
984  return true;
985  }
986  }
987 
988  return false;
989 }
990 
992  if (!part) {
993  return false;
994  }
995  const int kSeedMathBlobsCount = 2;
996  const int kSeedMathDigitBlobsCount = 5;
997 
998  int blobs = part->boxes_count(),
999  math_blobs = part->SpecialBlobsCount(BSTT_MATH),
1000  digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
1001  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
1002  math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
1003  return false;
1004  }
1005 
1006  return true;
1007 }
1008 
1010  const float math_density_high,
1011  const float math_density_low,
1012  const ColPartition* part) const {
1013  ASSERT_HOST(part);
1014  float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
1016  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
1017  if (math_digit_density > math_density_high) {
1018  return true;
1019  }
1020  if (math_digit_density + italic_density > kMathItalicDensityTh &&
1021  math_digit_density > math_density_low) {
1022  return true;
1023  }
1024 
1025  return false;
1026 }
1027 
1029  ASSERT_HOST(part);
1030 
1032  ColPartition *neighbor = NULL;
1033  const TBOX& part_box(part->bounding_box());
1034  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
1035  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
1036  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
1037 
1038  // Here we use a simple approximation algorithm: from the center of part, We
1039  // perform the radius search, and check if we can find a neighboring parition
1040  // that locates on the top/bottom left of part.
1041  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
1042  (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1043  search.SetUniqueMode(true);
1044  bool left_indented = false, right_indented = false;
1045  while ((neighbor = search.NextRadSearch()) != NULL &&
1046  (!left_indented || !right_indented)) {
1047  if (neighbor == part) {
1048  continue;
1049  }
1050  const TBOX& neighbor_box(neighbor->bounding_box());
1051 
1052  if (part_box.major_y_overlap(neighbor_box) &&
1053  part_box.x_gap(neighbor_box) < kXGapTh) {
1054  // When this happens, it is likely part is a fragment of an
1055  // over-segmented colpartition. So we return false.
1056  return NO_INDENT;
1057  }
1058 
1059  if (!IsTextOrEquationType(neighbor->type())) {
1060  continue;
1061  }
1062 
1063  // The neighbor should be above/below part, and overlap in x direction.
1064  if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1065  continue;
1066  }
1067 
1068  if (part_box.y_gap(neighbor_box) < kYGapTh) {
1069  int left_gap = part_box.left() - neighbor_box.left();
1070  int right_gap = neighbor_box.right() - part_box.right();
1071  if (left_gap > kXGapTh) {
1072  left_indented = true;
1073  }
1074  if (right_gap > kXGapTh) {
1075  right_indented = true;
1076  }
1077  }
1078  }
1079 
1080  if (left_indented && right_indented) {
1081  return BOTH_INDENT;
1082  }
1083  if (left_indented) {
1084  return LEFT_INDENT;
1085  }
1086  if (right_indented) {
1087  return RIGHT_INDENT;
1088  }
1089  return NO_INDENT;
1090 }
1091 
1093  if (seed == NULL || // This seed has been absorbed by other seeds.
1094  seed->IsVerticalType()) { // We skip vertical type right now.
1095  return false;
1096  }
1097 
1098  // Expand in four directions.
1099  GenericVector<ColPartition*> parts_to_merge;
1100  ExpandSeedHorizontal(true, seed, &parts_to_merge);
1101  ExpandSeedHorizontal(false, seed, &parts_to_merge);
1102  ExpandSeedVertical(true, seed, &parts_to_merge);
1103  ExpandSeedVertical(false, seed, &parts_to_merge);
1104  SearchByOverlap(seed, &parts_to_merge);
1105 
1106  if (parts_to_merge.empty()) { // We don't find any partition to merge.
1107  return false;
1108  }
1109 
1110  // Merge all partitions in parts_to_merge with seed. We first remove seed
1111  // from part_grid_ as its bounding box is going to expand. Then we add it
1112  // back after it aborbs all parts_to_merge parititions.
1113  part_grid_->RemoveBBox(seed);
1114  for (int i = 0; i < parts_to_merge.size(); ++i) {
1115  ColPartition* part = parts_to_merge[i];
1116  if (part->type() == PT_EQUATION) {
1117  // If part is in cp_seeds_, then we mark it as NULL so that we won't
1118  // process it again.
1119  for (int j = 0; j < cp_seeds_.size(); ++j) {
1120  if (part == cp_seeds_[j]) {
1121  cp_seeds_[j] = NULL;
1122  break;
1123  }
1124  }
1125  }
1126 
1127  // part has already been removed from part_grid_ in function
1128  // ExpandSeedHorizontal/ExpandSeedVertical.
1129  seed->Absorb(part, NULL);
1130  }
1131 
1132  return true;
1133 }
1134 
1136  const bool search_left,
1137  ColPartition* seed,
1138  GenericVector<ColPartition*>* parts_to_merge) {
1139  ASSERT_HOST(seed != NULL && parts_to_merge != NULL);
1140  const float kYOverlapTh = 0.6;
1141  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
1142 
1144  const TBOX& seed_box(seed->bounding_box());
1145  int x = search_left ? seed_box.left() : seed_box.right();
1146  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
1147  search.SetUniqueMode(true);
1148 
1149  // Search iteratively.
1150  ColPartition *part = NULL;
1151  while ((part = search.NextSideSearch(search_left)) != NULL) {
1152  if (part == seed) {
1153  continue;
1154  }
1155  const TBOX& part_box(part->bounding_box());
1156  if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.
1157  break;
1158  }
1159 
1160  // Check part location.
1161  if ((part_box.left() >= seed_box.left() && search_left) ||
1162  (part_box.right() <= seed_box.right() && !search_left)) {
1163  continue;
1164  }
1165 
1166  if (part->type() != PT_EQUATION) { // Non-equation type.
1167  // Skip PT_LINLINE_EQUATION and non text type.
1168  if (part->type() == PT_INLINE_EQUATION ||
1169  (!IsTextOrEquationType(part->type()) &&
1170  part->blob_type() != BRT_HLINE)) {
1171  continue;
1172  }
1173  // For other types, it should be the near small neighbor of seed.
1174  if (!IsNearSmallNeighbor(seed_box, part_box) ||
1175  !CheckSeedNeighborDensity(part)) {
1176  continue;
1177  }
1178  } else { // Equation type, check the y overlap.
1179  if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1180  seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1181  continue;
1182  }
1183  }
1184 
1185  // Passed the check, delete it from search and add into parts_to_merge.
1186  search.RemoveBBox();
1187  parts_to_merge->push_back(part);
1188  }
1189 }
1190 
1192  const bool search_bottom,
1193  ColPartition* seed,
1194  GenericVector<ColPartition*>* parts_to_merge) {
1195  ASSERT_HOST(seed != NULL && parts_to_merge != NULL &&
1196  cps_super_bbox_ != NULL);
1197  const float kXOverlapTh = 0.4;
1198  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
1199 
1201  const TBOX& seed_box(seed->bounding_box());
1202  int y = search_bottom ? seed_box.bottom() : seed_box.top();
1203  search.StartVerticalSearch(
1205  search.SetUniqueMode(true);
1206 
1207  // Search iteratively.
1208  ColPartition *part = NULL;
1210  int skipped_min_top = INT_MAX, skipped_max_bottom = -1;
1211  while ((part = search.NextVerticalSearch(search_bottom)) != NULL) {
1212  if (part == seed) {
1213  continue;
1214  }
1215  const TBOX& part_box(part->bounding_box());
1216 
1217  if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.
1218  break;
1219  }
1220 
1221  // Check part location.
1222  if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1223  (part_box.top() <= seed_box.top() && !search_bottom)) {
1224  continue;
1225  }
1226 
1227  bool skip_part = false;
1228  if (part->type() != PT_EQUATION) { // Non-equation type.
1229  // Skip PT_LINLINE_EQUATION and non text type.
1230  if (part->type() == PT_INLINE_EQUATION ||
1231  (!IsTextOrEquationType(part->type()) &&
1232  part->blob_type() != BRT_HLINE)) {
1233  skip_part = true;
1234  } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
1235  !CheckSeedNeighborDensity(part)) {
1236  // For other types, it should be the near small neighbor of seed.
1237  skip_part = true;
1238  }
1239  } else { // Equation type, check the x overlap.
1240  if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1241  seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1242  skip_part = true;
1243  }
1244  }
1245  if (skip_part) {
1246  if (part->type() != PT_EQUATION) {
1247  if (skipped_min_top > part_box.top()) {
1248  skipped_min_top = part_box.top();
1249  }
1250  if (skipped_max_bottom < part_box.bottom()) {
1251  skipped_max_bottom = part_box.bottom();
1252  }
1253  }
1254  } else {
1255  parts.push_back(part);
1256  }
1257  }
1258 
1259  // For every part in parts, we need verify it is not above skipped_min_top
1260  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
1261  // we will skip a part if it looks like:
1262  // search bottom | search top
1263  // seed: ****************** | part: **********
1264  // skipped: xxx | skipped: xxx
1265  // part: ********** | seed: ***********
1266  for (int i = 0; i < parts.size(); i++) {
1267  const TBOX& part_box(parts[i]->bounding_box());
1268  if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
1269  (!search_bottom && part_box.bottom() >= skipped_min_top)) {
1270  continue;
1271  }
1272  // Add parts[i] into parts_to_merge, and delete it from part_grid_.
1273  parts_to_merge->push_back(parts[i]);
1274  part_grid_->RemoveBBox(parts[i]);
1275  }
1276 }
1277 
1279  const TBOX& part_box) const {
1280  const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
1281  const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
1282 
1283  // Check geometric feature.
1284  if (part_box.height() > seed_box.height() ||
1285  part_box.width() > seed_box.width()) {
1286  return false;
1287  }
1288 
1289  // Check overlap and distance.
1290  if ((!part_box.major_x_overlap(seed_box) ||
1291  part_box.y_gap(seed_box) > kYGapTh) &&
1292  (!part_box.major_y_overlap(seed_box) ||
1293  part_box.x_gap(seed_box) > kXGapTh)) {
1294  return false;
1295  }
1296 
1297  return true;
1298 }
1299 
1301  ASSERT_HOST(part);
1302  if (part->boxes_count() < kSeedBlobsCountTh) {
1303  // Too few blobs, skip the check.
1304  return true;
1305  }
1306 
1307  // We check the math blobs density and the unclear blobs density.
1308  if (part->SpecialBlobsDensity(BSTT_MATH) +
1311  return true;
1312  }
1313 
1314  return false;
1315 }
1316 
1318  // Iterate over part_grid_, and find all parts that are text type but not
1319  // equation type.
1320  ColPartition *part = NULL;
1321  GenericVector<ColPartition*> text_parts;
1323  gsearch.StartFullSearch();
1324  while ((part = gsearch.NextFullSearch()) != NULL) {
1325  if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
1326  text_parts.push_back(part);
1327  }
1328  }
1329  if (text_parts.empty()) {
1330  return;
1331  }
1332 
1333  // Compute the medium height of the text_parts.
1334  text_parts.sort(&SortCPByHeight);
1335  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
1336  int med_height = text_box.height();
1337  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
1338  const TBOX& text_box =
1339  text_parts[text_parts.size() / 2 - 1]->bounding_box();
1340  med_height = static_cast<int>(roundf(
1341  0.5 * (text_box.height() + med_height)));
1342  }
1343 
1344  // Iterate every text_parts and check if it is a math block satellite.
1345  for (int i = 0; i < text_parts.size(); ++i) {
1346  const TBOX& text_box(text_parts[i]->bounding_box());
1347  if (text_box.height() > med_height) {
1348  continue;
1349  }
1350  GenericVector<ColPartition*> math_blocks;
1351  if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
1352  continue;
1353  }
1354 
1355  // Found. merge text_parts[i] with math_blocks.
1356  part_grid_->RemoveBBox(text_parts[i]);
1357  text_parts[i]->set_type(PT_EQUATION);
1358  for (int j = 0; j < math_blocks.size(); ++j) {
1359  part_grid_->RemoveBBox(math_blocks[j]);
1360  text_parts[i]->Absorb(math_blocks[j], NULL);
1361  }
1362  InsertPartAfterAbsorb(text_parts[i]);
1363  }
1364 }
1365 
1367  ColPartition* part, GenericVector<ColPartition*>* math_blocks) {
1368  ASSERT_HOST(part != NULL && math_blocks != NULL);
1369  math_blocks->clear();
1370  const TBOX& part_box(part->bounding_box());
1371  // Find the top/bottom nearest neighbor of part.
1372  ColPartition *neighbors[2];
1373  int y_gaps[2] = {INT_MAX, INT_MAX};
1374  // The horizontal boundary of the neighbors.
1375  int neighbors_left = INT_MAX, neighbors_right = 0;
1376  for (int i = 0; i < 2; ++i) {
1377  neighbors[i] = SearchNNVertical(i != 0, part);
1378  if (neighbors[i]) {
1379  const TBOX& neighbor_box = neighbors[i]->bounding_box();
1380  y_gaps[i] = neighbor_box.y_gap(part_box);
1381  if (neighbor_box.left() < neighbors_left) {
1382  neighbors_left = neighbor_box.left();
1383  }
1384  if (neighbor_box.right() > neighbors_right) {
1385  neighbors_right = neighbor_box.right();
1386  }
1387  }
1388  }
1389  if (neighbors[0] == neighbors[1]) {
1390  // This happens when part is inside neighbor.
1391  neighbors[1] = NULL;
1392  y_gaps[1] = INT_MAX;
1393  }
1394 
1395  // Check if part is within [neighbors_left, neighbors_right].
1396  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1397  return false;
1398  }
1399 
1400  // Get the index of the near one in neighbors.
1401  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1402 
1403  // Check the near one.
1404  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1405  math_blocks->push_back(neighbors[index]);
1406  } else {
1407  // If the near one failed the check, then we skip checking the far one.
1408  return false;
1409  }
1410 
1411  // Check the far one.
1412  index = 1 - index;
1413  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1414  math_blocks->push_back(neighbors[index]);
1415  }
1416 
1417  return true;
1418 }
1419 
1421  const bool search_bottom, const ColPartition* part) {
1422  ASSERT_HOST(part);
1423  ColPartition *nearest_neighbor = NULL, *neighbor = NULL;
1424  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
1425 
1427  search.SetUniqueMode(true);
1428  const TBOX& part_box(part->bounding_box());
1429  int y = search_bottom ? part_box.bottom() : part_box.top();
1430  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
1431  int min_y_gap = INT_MAX;
1432  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
1433  if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
1434  continue;
1435  }
1436  const TBOX& neighbor_box(neighbor->bounding_box());
1437  int y_gap = neighbor_box.y_gap(part_box);
1438  if (y_gap > kYGapTh) { // Out of scope.
1439  break;
1440  }
1441  if (!neighbor_box.major_x_overlap(part_box) ||
1442  (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1443  (!search_bottom && neighbor_box.top() < part_box.top())) {
1444  continue;
1445  }
1446  if (y_gap < min_y_gap) {
1447  min_y_gap = y_gap;
1448  nearest_neighbor = neighbor;
1449  }
1450  }
1451 
1452  return nearest_neighbor;
1453 }
1454 
1456  const int y_gap, const ColPartition *neighbor) const {
1457  if (!neighbor) {
1458  return false;
1459  }
1460  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
1461  return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
1462 }
1463 
1465  STRING* image_name) const {
1466  ASSERT_HOST(image_name && name);
1467  char page[50];
1468  snprintf(page, sizeof(page), "%04d", page_count_);
1469  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
1470 }
1471 
1472 void EquationDetect::PaintSpecialTexts(const STRING& outfile) const {
1473  Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary();
1474  pix = pixConvertTo32(pixBi);
1476  ColPartition* part = NULL;
1477  gsearch.StartFullSearch();
1478  while ((part = gsearch.NextFullSearch()) != NULL) {
1479  BLOBNBOX_C_IT blob_it(part->boxes());
1480  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1481  RenderSpecialText(pix, blob_it.data());
1482  }
1483  }
1484 
1485  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1486  pixDestroy(&pix);
1487 }
1488 
1489 void EquationDetect::PaintColParts(const STRING& outfile) const {
1490  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
1492  gsearch.StartFullSearch();
1493  ColPartition* part = NULL;
1494  while ((part = gsearch.NextFullSearch()) != NULL) {
1495  const TBOX& tbox = part->bounding_box();
1496  Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
1497  tbox.width(), tbox.height());
1498  if (part->type() == PT_EQUATION) {
1499  pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1500  } else if (part->type() == PT_INLINE_EQUATION) {
1501  pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1502  } else {
1503  pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1504  }
1505  boxDestroy(&box);
1506  }
1507 
1508  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1509  pixDestroy(&pix);
1510 }
1511 
1513  ASSERT_HOST(part);
1514  TBOX box(part->bounding_box());
1515  int h = pixGetHeight(lang_tesseract_->BestPix());
1516  tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
1517  h - box.top(), h - box.bottom());
1518  box.print();
1519  tprintf("blobs count = %d, density = ", part->boxes_count());
1520  for (int i = 0; i < BSTT_COUNT; ++i) {
1521  BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i);
1522  tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
1523  }
1524  tprintf("\n");
1525 }
1526 
1527 }; // namespace tesseract
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
const TBOX & bounding_box() const
Definition: blobbox.h:215
int count(LIST var_list)
Definition: oldlist.cpp:103
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
double x_overlap_fraction(const TBOX &box) const
Definition: rect.h:447
int source_resolution() const
const int kBlnBaselineOffset
Definition: normalis.h:29
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool IsTextOrEquationType(PolyBlockType type)
int IntCastRounded(double x)
Definition: helpers.h:172
const int kSeedBlobsCountTh
bool IsVerticalType() const
Definition: colpartition.h:435
ColPartitionGrid * part_grid_
const int kLeftIndentAlignmentCountTh
C_BLOB * cblob() const
Definition: blobbox.h:253
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
inT32 length() const
Definition: strngs.cpp:196
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
PolyBlockType type() const
Definition: colpartition.h:181
void SplitCPHor(ColPartition *part, GenericVector< ColPartition *> *parts_splitted)
BlobTextFlowType
Definition: blobbox.h:99
STRING imagebasename
Definition: ccutil.h:66
IndentType IsIndented(ColPartition *part)
inT16 width() const
Definition: rect.h:111
#define MIN(x, y)
Definition: ndminx.h:28
float SpecialBlobsDensity(const BlobSpecialTextType type) const
const float kMathDigitDensityTh2
int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
float ComputeForegroundDensity(const TBOX &tbox)
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
const float kMathItalicDensityTh
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
TBOX bounding_box() const
Definition: blobs.cpp:482
bool CheckSeedNeighborDensity(const ColPartition *part) const
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
void Absorb(ColPartition *other, WidthCallback *cb)
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int push_back(T object)
GenericVector< ColPartition * > cp_seeds_
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
int y_gap(const TBOX &box) const
Definition: rect.h:225
inT16 bottom() const
Definition: rect.h:61
void SetResolution(const int resolution)
void PaintColParts(const STRING &outfile) const
const char * string() const
Definition: strngs.cpp:201
void PrintSpecialBlobsDensity(const ColPartition *part) const
bool joined_to_prev() const
Definition: blobbox.h:241
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:536
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
int SpecialBlobsCount(const BlobSpecialTextType type)
const float kMathDigitDensityTh1
Pix * pix_binary() const
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
inT16 left() const
Definition: rect.h:68
const int kBlnXHeight
Definition: normalis.h:28
void SetPartitionType(int resolution, ColPartitionSet *columns)
bool equationdetect_save_seed_image
void delete_data_pointers()
bool equationdetect_save_merged_image
void SetLangTesseract(Tesseract *lang_tesseract)
int x_gap(const TBOX &box) const
Definition: rect.h:217
int LabelSpecialText(TO_BLOCK *to_block)
BBC * NextFullSearch()
Definition: bbgrid.h:678
inT16 height() const
Definition: rect.h:104
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN_INT32
Definition: host.h:61
bool ExpandSeed(ColPartition *seed)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
ColPartitionSet ** best_columns_
const float kUnclearDensityTh
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
float certainty() const
Definition: ratngs.h:82
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition *> *math_blocks)
BOOL8 contains(const char c) const
Definition: strngs.cpp:192
Pix * BestPix() const
int classify_integer_matcher_multiplier
Definition: classify.h:469
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
Definition: blobs.h:261
void PaintSpecialTexts(const STRING &outfile) const
int classify_class_pruner_multiplier
Definition: classify.h:465
BlobRegionType
Definition: blobbox.h:57
ColPartition * CopyButDontOwnBlobs()
inT16 top() const
Definition: rect.h:54
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
void InsertPartAfterAbsorb(ColPartition *part)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
int binary_search(const T &target) const
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
void IdentifyBlobsToSkip(ColPartition *part)
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:277
bool equationdetect_save_spt_image
inT16 fontinfo_id() const
Definition: ratngs.h:85
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
EquationDetect(const char *equ_datapath, const char *equ_language)
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
BlobTextFlowType flow() const
Definition: colpartition.h:154
bool IsLeftIndented(const EquationDetect::IndentType type)
bool IsRightIndented(const EquationDetect::IndentType type)
UNICHARSET unicharset
Definition: ccutil.h:70
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:290
PolyBlockType
Definition: publictypes.h:41
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:151
const TBOX & bounding_box() const
Definition: colpartition.h:109
ColPartition * SplitAt(int split_x)
#define BOOL_VAR(name, val, comment)
Definition: params.h:280
BlobRegionType blob_type() const
Definition: colpartition.h:148
bool CheckSeedBlobsCount(ColPartition *part)
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
BlobSpecialTextType
Definition: blobbox.h:81
bool empty() const
Definition: genericvector.h:84
void GetOutputTiffName(const char *name, STRING *image_name) const
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)
void set_type(PolyBlockType t)
Definition: colpartition.h:184
#define ASSERT_HOST(x)
Definition: errcode.h:84
void RepositionIterator()
Definition: bbgrid.h:895
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
bool equationdetect_save_bi_image
void StartFullSearch()
Definition: bbgrid.h:668
bool bool_binary_search(const T &target) const
int UNICHAR_ID
Definition: unichar.h:33