20 #define __func__ __FUNCTION__ 52 static int Epsilon(
int space_pix) {
53 return space_pix * 4 / 5;
56 static bool AcceptableRowArgs(
57 int debug_level,
int min_num_rows,
const char *function_name,
59 int row_start,
int row_end) {
60 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
61 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
62 row_start, row_end, rows->
size());
65 if (row_end - row_start < min_num_rows) {
66 if (debug_level > 1) {
67 tprintf(
"# Too few rows[%d, %d) for %s.\n",
68 row_start, row_end, function_name);
78 static STRING StrOf(
int num) {
80 snprintf(buffer,
sizeof(buffer),
"%d", num);
89 for (
int r = 0; r < rows.
size(); r++) {
90 int num_columns = rows[r].
size();
91 for (
int c = 0; c < num_columns; c++) {
93 for (
int i = 0; i < rows[r][c].
size(); i++) {
94 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
96 if (c >= max_col_widths.
size()) {
99 if (num_unicodes > max_col_widths[c])
100 max_col_widths[c] = num_unicodes;
106 for (
int c = 0; c < max_col_widths.
size(); c++) {
108 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
111 for (
int r = 0; r < rows.
size(); r++) {
112 for (
int c = 0; c < rows[r].
size(); c++) {
115 tprintf(col_width_patterns[c].
string(), rows[r][c].
string());
128 static void PrintDetectorState(
const ParagraphTheory &theory,
132 output.
back().push_back(
"#row");
133 output.
back().push_back(
"space");
134 output.
back().push_back(
"..");
135 output.
back().push_back(
"lword[widthSEL]");
136 output.
back().push_back(
"rword[widthSEL]");
138 output.
back().push_back(
"text");
140 for (
int i = 0; i < rows.
size(); i++) {
143 const RowInfo& ri = *rows[i].ri_;
145 row.
push_back(StrOf(ri.average_interword_space));
146 row.
push_back(ri.has_leaders ?
".." :
" ");
148 "[" + StrOf(ri.lword_box.width()) +
149 (ri.lword_likely_starts_idea ?
"S" :
"s") +
150 (ri.lword_likely_ends_idea ?
"E" :
"e") +
151 (ri.lword_indicates_list_item ?
"L" :
"l") +
154 "[" + StrOf(ri.rword_box.width()) +
155 (ri.rword_likely_starts_idea ?
"S" :
"s") +
156 (ri.rword_likely_ends_idea ?
"E" :
"e") +
157 (ri.rword_indicates_list_item ?
"L" :
"l") +
159 rows[i].AppendDebugInfo(theory, &row);
162 PrintTable(output,
" ");
164 tprintf(
"Active Paragraph Models:\n");
165 for (
int m = 0; m < theory.models().size(); m++) {
166 tprintf(
" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
170 static void DebugDump(
173 const ParagraphTheory &theory,
178 PrintDetectorState(theory, rows);
183 int row_start,
int row_end) {
184 tprintf(
"======================================\n");
185 for (
int row = row_start; row < row_end; row++) {
186 tprintf(
"%s\n", rows[row].ri_->text.string());
188 tprintf(
"======================================\n");
194 return (ch >=
'a' && ch <=
'z') || (ch >=
'A' && ch <=
'Z');
198 return ch ==
'o' || ch ==
'O' || ch ==
'l' || ch ==
'I';
202 return strchr(
"'\"({[", ch) != NULL;
206 return strchr(
":'\".?!]})", ch) != NULL;
210 const char *
SkipChars(
const char *str,
const char *toskip) {
211 while (*str !=
'\0' && strchr(toskip, *str)) { str++; }
215 const char *
SkipChars(
const char *str,
bool (*skip)(
int)) {
216 while (*str !=
'\0' && skip(*str)) { str++; }
220 const char *
SkipOne(
const char *str,
const char *toskip) {
221 if (*str !=
'\0' && strchr(toskip, *str))
return str + 1;
229 const char *kRomans =
"ivxlmdIVXLMD";
// Characters accepted as decimal digits when scanning a numeral segment with
// SkipChars(). The set must contain every digit 0-9; the previous literal
// omitted '6', so numerals containing a 6 were not skipped as digits.
const char *kDigits = "0123456789";
231 const char *kOpen =
"[{(";
232 const char *kSep =
":;-.,";
233 const char *kClose =
"]})";
235 int num_segments = 0;
236 const char *pos = word.
string();
237 while (*pos !=
'\0' && num_segments < 3) {
240 const char *numeral_end =
SkipChars(numeral_start, kRomans);
241 if (numeral_end != numeral_start) {
244 numeral_end =
SkipChars(numeral_start, kDigits);
245 if (numeral_end == numeral_start) {
248 if (numeral_end - numeral_start != 1)
256 if (pos == numeral_end)
263 const char *kListMarks =
"0Oo*.,+.";
264 return word.
size() == 1 && strchr(kListMarks, word[0]) != NULL;
275 if (!u || !werd || pos > werd->
length())
285 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
303 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
314 const char *kRomans =
"ivxlmdIVXLMD";
315 while (pos < wordlen_) {
317 if (ch >= 0xF0 || strchr(kRomans, ch) == 0)
break;
324 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
362 int num_segments = 0;
364 while (pos < werd->length() && num_segments < 3) {
365 int numeral_start = m.
SkipPunc(pos);
366 if (numeral_start > pos + 1)
break;
367 int numeral_end = m.
SkipRomans(numeral_start);
368 if (numeral_end == numeral_start) {
370 if (numeral_end == numeral_start) {
372 numeral_end = m.
SkipAlpha(numeral_start);
373 if (numeral_end - numeral_start != 1)
381 if (pos == numeral_end)
384 return pos == werd->
length();
396 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
398 *starts_idea =
false;
400 if (utf8.
size() == 0 || (werd != NULL && werd->
length() == 0)) {
405 if (unicharset && werd) {
423 int start_letter = utf8[0];
430 if (start_letter >=
'A' && start_letter <=
'Z') {
443 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
445 *starts_idea =
false;
447 if (utf8.
size() == 0 || (werd != NULL && werd->
length() == 0)) {
452 if (unicharset && werd) {
466 int last_letter = utf8[utf8.
size() - 1];
477 header->
push_back(
"[lmarg,lind;rind,rmarg]");
484 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
491 int model_numbers = 0;
492 for (
int h = 0; h < hypotheses_.size(); h++) {
493 if (hypotheses_[h].model == NULL)
495 if (model_numbers > 0)
498 model_string += StrOf(1 + theory.
IndexOf(hypotheses_[h].model));
499 }
else if (hypotheses_[h].model ==
kCrownLeft) {
500 model_string +=
"CrL";
502 model_string +=
"CrR";
506 if (model_numbers == 0)
521 if (hypotheses_.empty())
523 bool has_start =
false;
524 bool has_body =
false;
525 for (
int i = 0; i < hypotheses_.size(); i++) {
526 switch (hypotheses_[i].ty) {
527 case LT_START: has_start =
true;
break;
528 case LT_BODY: has_body =
true;
break;
530 tprintf(
"Encountered bad value in hypothesis list: %c\n",
535 if (has_start && has_body)
541 if (hypotheses_.empty())
543 bool has_start =
false;
544 bool has_body =
false;
545 for (
int i = 0; i < hypotheses_.size(); i++) {
546 if (hypotheses_[i].model != model)
548 switch (hypotheses_[i].ty) {
549 case LT_START: has_start =
true;
break;
550 case LT_BODY: has_body =
true;
break;
552 tprintf(
"Encountered bad value in hypothesis list: %c\n",
557 if (has_start && has_body)
565 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
575 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
586 hypotheses_.remove(old_idx);
593 hypotheses_.remove(old_idx);
597 for (
int h = 0; h < hypotheses_.size(); h++) {
604 for (
int h = 0; h < hypotheses_.size(); h++) {
611 for (
int h = 0; h < hypotheses_.size(); h++) {
612 if (hypotheses_[h].model != NULL)
618 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START)
620 return hypotheses_[0].model;
624 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY)
626 return hypotheses_[0].model;
634 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
635 if (!models.
contains(hypotheses_[h].model)) {
636 hypotheses_.remove(h);
654 : max_cluster_width_(max_cluster_width) {}
660 int max_cluster_width_;
667 for (
int i = 0; i < clusters.
size(); i++) {
668 if (abs(value - clusters[i].center) <
669 abs(value - clusters[best_index].center))
678 for (
int i = 0; i < values_.
size();) {
682 while (++i < values_.
size() && values_[i] <= lo + max_cluster_width_) {
692 int row_start,
int row_end,
696 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
703 for (
int i = row_start; i < row_end; i++) {
704 initial_lefts.
Add((*rows)[i].lindent_);
705 initial_rights.
Add((*rows)[i].rindent_);
723 int infrequent_enough_to_ignore = 0;
724 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
725 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
727 for (
int i = row_start; i < row_end; i++) {
728 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
729 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
730 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
731 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
732 lefts.
Add((*rows)[i].lindent_);
733 rights.
Add((*rows)[i].rindent_);
739 if ((left_tabs->
size() == 1 && right_tabs->
size() >= 4) ||
740 (right_tabs->
size() == 1 && left_tabs->
size() >= 4)) {
745 for (
int i = row_start; i < row_end; i++) {
746 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
747 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
748 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
749 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
750 lefts.
Add((*rows)[i].lindent_);
751 rights.
Add((*rows)[i].rindent_);
760 if (left_tabs->
size() == 3 && right_tabs->
size() >= 4) {
762 for (
int i = left_tabs->
size() - 1; i >= 0; i--) {
764 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
769 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
770 left_tabs->
remove(to_prune);
773 if (right_tabs->
size() == 3 && left_tabs->
size() >= 4) {
775 for (
int i = right_tabs->
size() - 1; i >= 0; i--) {
777 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
782 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783 right_tabs->
remove(to_prune);
808 int row_start,
int row_end,
812 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
814 for (
int row = row_start; row < row_end; row++) {
817 if (valid_first && !valid_body) {
818 (*rows)[row].AddStartLine(model);
819 }
else if (valid_body && !valid_first) {
820 (*rows)[row].AddBodyLine(model);
821 }
else if (valid_body && valid_first) {
822 bool after_eop = (row == row_start);
823 if (row > row_start) {
824 if (eop_threshold > 0) {
826 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
828 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
836 (*rows)[row].AddStartLine(model);
838 (*rows)[row].AddBodyLine(model);
857 int r_start,
int r_end)
864 tprintf(
"Geometry: TabStop cluster tolerance = %d; " 865 "%d left tabs; %d right tabs\n",
868 ltr = (*r)[r_start].ri_->ltr;
917 void Fail(
int min_debug_level,
const char *why)
const {
990 int num_full_rows = 0;
991 int last_row_full = 0;
995 if (i == s.
row_end - 1) last_row_full++;
999 if (num_full_rows < 0.7 * num_rows) {
1000 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1013 if (debug_level > 0) {
1014 tprintf(
"# Not enough variety for clear outline classification. " 1015 "Guessing these are %s aligned based on script.\n",
1016 s.
ltr ?
"left" :
"right");
1024 if (num_rows - 1 == num_full_rows - last_row_full) {
1029 (*s.
rows)[i].AddBodyLine(model);
1079 int row_start,
int row_end,
1081 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1083 if (debug_level > 1) {
1084 tprintf(
"###############################################\n");
1085 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1086 row_start, row_end);
1087 tprintf(
"###############################################\n");
1093 s.
Fail(2,
"Too much variety for simple outline classification.");
1097 s.
Fail(1,
"Not enough variety for simple outline classification.");
1126 int firsts[2] = {0, 0};
1131 bool jam_packed =
true;
1146 int percent0firsts, percent1firsts;
1147 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1148 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1151 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1152 percent0firsts + 30 < percent1firsts) {
1155 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1156 percent1firsts + 30 < percent0firsts) {
1161 if (debug_level > 1) {
1162 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1164 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1165 s.
AlignTabs()[0].center, percent0firsts);
1166 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1167 s.
AlignTabs()[1].center, percent1firsts);
1215 for (
int i = 0; i < models_->
size(); i++) {
1216 if ((*models_)[i]->Comparable(model))
1217 return (*models_)[i];
1226 for (
int i = models_->
size() - 1; i >= 0; i--) {
1241 for (
int m = 0; m < models_->
size(); m++) {
1251 for (
int m = 0; m < models_->
size(); m++) {
1259 for (
int i = 0; i < models_->
size(); i++) {
1260 if ((*models_)[i] == model)
1269 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1273 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1274 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1280 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1284 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1291 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1312 : theory_(theory), rows_(rows), row_start_(row_start),
1314 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1320 for (
int row = row_start - 1; row <= row_end; row++) {
1321 open_models_.push_back(no_models);
1326 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1328 if (row_start < row_start_) row_start = row_start_;
1329 if (row_end > row_end_) row_end = row_end_;
1331 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1333 if ((*rows_)[row].ri_->num_words == 0) {
1334 OpenModels(row + 1) = no_models;
1337 (*rows_)[row].StartHypotheses(&opened);
1341 for (
int m = 0; m < opened.size(); m++) {
1347 still_open.push_back_new(opened[m]);
1350 OpenModels(row + 1) = still_open;
1357 CalculateOpenModels(row_start_, row_end_);
1362 for (
int i = row_start_; i < row_end_; i++) {
1371 bool left_align_open =
false;
1372 bool right_align_open =
false;
1373 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1374 switch (OpenModels(i)[m]->justification()) {
1377 default: left_align_open = right_align_open =
true;
1385 likely_start =
true;
1387 if ((left_align_open && right_align_open) ||
1388 (!left_align_open && !right_align_open)) {
1393 }
else if (left_align_open) {
1408 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1417 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1421 for (
int m = 0; m < last_line_models.
size(); m++) {
1436 for (
int m = 0; m < all_models.
size(); m++) {
1446 CalculateOpenModels(i + 1, row_end_);
1458 for (
int i = 0; i < rows.
size(); i++) {
1459 rows[i].StrongHypotheses(&used_models);
1492 for (
int end = rows->
size(); end > 0; end = start) {
1496 (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
1499 if (end == 0)
break;
1501 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1504 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1530 (*rows)[start].SetUnknown();
1531 (*rows)[start].AddStartLine(crown_model);
1532 for (
int row = start + 1; row < end; row++) {
1533 (*rows)[row].SetUnknown();
1534 (*rows)[row].AddBodyLine(crown_model);
1561 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1564 int lmin, lmax, rmin, rmax;
1565 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1566 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1567 for (
int i = start; i < end; i++) {
1575 STATS lefts(lmin, lmax + 1);
1576 STATS rights(rmin, rmax + 1);
1577 for (
int i = start; i < end; i++) {
1584 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1585 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1586 for (
int i = start; i < end; i++) {
1588 int ldelta = ignorable_left - sr.
lmargin_;
1591 int rdelta = ignorable_right - sr.
rmargin_;
1599 int row_start,
int row_end) {
1600 if (row_end < row_start + 1)
return 1;
1601 int word_height = (rows[row_start].ri_->lword_box.height() +
1602 rows[row_end - 1].ri_->lword_box.height()) / 2;
1603 int word_width = (rows[row_start].ri_->lword_box.width() +
1604 rows[row_end - 1].ri_->lword_box.width()) / 2;
1605 STATS spacing_widths(0, 5 + word_width);
1606 for (
int i = row_start; i < row_end; i++) {
1607 if (rows[i].ri_->num_words > 1) {
1608 spacing_widths.
add(rows[i].ri_->average_interword_space, 1);
1611 int minimum_reasonable_space = word_height / 3;
1612 if (minimum_reasonable_space < 2)
1613 minimum_reasonable_space = 2;
1614 int median = spacing_widths.
median();
1615 return (median > minimum_reasonable_space)
1616 ? median : minimum_reasonable_space;
1628 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1630 int available_space;
1651 int available_space = before.
lindent_;
1652 if (before.
rindent_ > available_space)
1694 int start,
int end,
int tolerance,
bool *consistent) {
1695 int ltr_line_count = 0;
1696 for (
int i = start; i < end; i++) {
1697 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1699 bool ltr = (ltr_line_count >= (end - start) / 2);
1702 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1707 int lmargin = (*rows)[start].lmargin_;
1708 int rmargin = (*rows)[start].rmargin_;
1709 int lmin, lmax, rmin, rmax, cmin, cmax;
1710 lmin = lmax = (*rows)[start + 1].lindent_;
1711 rmin = rmax = (*rows)[start + 1].rindent_;
1713 for (
int i = start + 1; i < end; i++) {
1714 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1715 tprintf(
"Margins don't match! Software error.\n");
1716 *consistent =
false;
1721 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1723 int ldiff = lmax - lmin;
1724 int rdiff = rmax - rmin;
1725 int cdiff = cmax - cmin;
1726 if (rdiff > tolerance && ldiff > tolerance) {
1727 if (cdiff < tolerance * 2) {
1728 if (end - start < 3)
1732 *consistent =
false;
1735 if (end - start < 3)
1740 bool body_admits_left_alignment = ldiff < tolerance;
1741 bool body_admits_right_alignment = rdiff < tolerance;
1745 (lmin + lmax) / 2, tolerance);
1748 (rmin + rmax) / 2, tolerance);
1752 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1753 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1758 if (tolerance < rdiff) {
1759 if (body_admits_left_alignment && text_admits_left_alignment)
1761 *consistent =
false;
1764 if (tolerance < ldiff) {
1765 if (body_admits_right_alignment && text_admits_right_alignment)
1767 *consistent =
false;
1775 int first_left = (*rows)[start].lindent_;
1776 int first_right = (*rows)[start].rindent_;
1778 if (ltr && body_admits_left_alignment &&
1779 (first_left < lmin || first_left > lmax))
1781 if (!ltr && body_admits_right_alignment &&
1782 (first_right < rmin || first_right > rmax))
1785 *consistent =
false;
1796 int start,
int end,
int tolerance) {
1797 bool unused_consistent;
1799 rows, start, end, tolerance, &unused_consistent);
1801 tprintf(
"Could not determine a model for this paragraph:\n");
1802 PrintRowRange(*rows, start, end);
1810 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1813 for (
int i = start + 1 ; i < end; i++) {
1831 int row_start,
int row_end) {
1833 for (
int i = row_start + 1; i < row_end; i++) {
1871 for (
int i = row_start + 1; i < row_end - 1; i++) {
1902 int row_start,
int row_end,
1903 bool allow_flush_models,
1905 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1908 int start = row_start;
1909 while (start < row_end) {
1910 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1912 if (start >= row_end - 1)
1915 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1918 bool next_consistent;
1924 if (end < row_end - 1) {
1927 next_consistent = lt ==
LT_BODY ||
1931 next_consistent =
false;
1933 if (next_consistent) {
1935 rows, start, end + 1, tolerance, &next_consistent);
1936 if (((*rows)[start].ri_->ltr &&
1939 (!(*rows)[start].ri_->ltr &&
1942 next_consistent =
false;
1944 last_model = next_model;
1946 next_consistent =
false;
1948 }
while (next_consistent && end < row_end);
1952 if (end > start + 1) {
1956 debug_level, rows, start, end,
1961 if (end == start + 2) {
1964 }
else if (start == row_start) {
1971 }
else if (allow_flush_models) {
1972 model = theory->
AddModel(new_model);
1975 model = theory->
AddModel(new_model);
1978 (*rows)[start].AddStartLine(model);
1979 for (
int i = start + 1; i < end; i++) {
1980 (*rows)[i].AddBodyLine(model);
1997 int row_start,
int row_end,
1999 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2002 if (debug_level > 1) {
2003 tprintf(
"#############################################\n");
2004 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2005 tprintf(
"#############################################\n");
2011 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2016 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2026 int row_start,
int row_end,
2028 for (
int i = row_start + 1; i < row_end - 1; i++) {
2029 if ((*rows)[i - 1].ri_->has_leaders &&
2030 (*rows)[i].ri_->has_leaders &&
2031 (*rows)[i + 1].ri_->has_leaders) {
2034 (*rows)[i].AddStartLine(model);
2046 int end = rows.
size();
2048 for (; end > 0; end = start) {
2052 bool single_line_paragraph =
false;
2054 rows[start].NonNullHypotheses(&models);
2055 if (!models.
empty()) {
2057 if (rows[start].GetLineType(model) !=
LT_BODY)
2058 single_line_paragraph =
true;
2060 if (model && !single_line_paragraph) {
2062 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2065 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2069 if (model == NULL) {
2079 for (
int row = end; row < rows.
size(); row++) {
2080 if ((*row_owners)[row] &&
2084 model = (*row_owners)[row]->model;
2092 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2097 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2100 rows[start].SetUnknown();
2101 rows[start].AddStartLine(model);
2102 for (
int i = start + 1; i < end; i++) {
2103 rows[i].SetUnknown();
2104 rows[i].AddBodyLine(model);
2110 ? rows[start].ri_->rword_indicates_list_item
2111 : rows[start].ri_->lword_indicates_list_item;
2112 for (
int row = start; row < end; row++) {
2113 if ((*row_owners)[row] != NULL) {
2114 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " 2115 "more than once!\n");
2116 delete (*row_owners)[row];
2118 (*row_owners)[row] = p;
2142 rows[row].StrongHypotheses(&row_models);
2144 for (
int m = 0; m < row_models.
size(); m++) {
2145 bool all_starts = rows[row].GetLineType();
2147 bool continues =
true;
2148 for (
int i = row - 1; i >= 0 && continues; i--) {
2150 rows[i].NonNullHypotheses(&models);
2151 switch (rows[i].GetLineType(row_models[m])) {
2152 case LT_START: run_length++;
break;
2154 case LT_BODY: run_length++; all_starts =
false;
break;
2156 default: continues =
false;
2160 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2162 rows[i].NonNullHypotheses(&models);
2163 switch (rows[i].GetLineType(row_models[m])) {
2164 case LT_START: run_length++;
break;
2166 case LT_BODY: run_length++; all_starts =
false;
break;
2168 default: continues =
false;
2171 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2184 int row_start,
int row_end) {
2186 for (
int i = row_start; i < row_end; i++) {
2187 bool needs_fixing =
false;
2191 rows[i].StrongHypotheses(&models);
2192 rows[i].NonNullHypotheses(&models_w_crowns);
2193 if (models.
empty() && !models_w_crowns.
empty()) {
2195 for (
int end = i + 1; end < rows.
size(); end++) {
2198 rows[end].NonNullHypotheses(&end_models);
2199 rows[end].StrongHypotheses(&strong_end_models);
2200 if (end_models.
empty()) {
2201 needs_fixing =
true;
2203 }
else if (!strong_end_models.
empty()) {
2204 needs_fixing =
false;
2208 }
else if (models.
empty() && rows[i].ri_->num_words > 0) {
2210 needs_fixing =
true;
2213 if (!needs_fixing && !models.
empty()) {
2225 for (
int i = 0; i < to_fix->
size(); i++) {
2226 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2235 PARA_LIST *paragraphs) {
2237 paragraphs->
clear();
2238 PARA_IT out(paragraphs);
2239 PARA *formerly_null = NULL;
2240 for (
int i = 0; i < rows.
size(); i++) {
2241 if (rows[i] == NULL) {
2242 if (i == 0 || rows[i - 1] != formerly_null) {
2243 rows[i] = formerly_null =
new PARA();
2245 rows[i] = formerly_null;
2248 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2251 out.add_after_then_move(rows[i]);
2268 PARA_LIST *paragraphs,
2278 for (
int i = 0; i < row_infos->
size(); i++) {
2279 rows[i].Init((*row_infos)[i]);
2289 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2293 for (
int i = 0; i < leftovers.
size(); i++) {
2300 leftovers[i].begin, leftovers[i].end, &theory);
2308 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2309 (leftovers2.
size() == 1 &&
2310 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2311 if (pass2a_was_useful) {
2312 for (
int j = 0; j < leftovers2.
size(); j++) {
2314 leftovers2[j].begin, leftovers2[j].end,
2320 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2327 for (
int i = 0; i < leftovers.
size(); i++) {
2329 leftovers[i].begin, leftovers[i].end, &theory);
2335 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2340 for (
int i = 0; i < leftovers.
size(); i++) {
2341 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2342 rows[j].SetUnknown();
2346 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2352 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2364 PageIterator pit(static_cast<const PageIterator&>(it));
2365 bool first_word =
true;
2380 if (fake_text.
size() == 0)
return;
2383 for (
int i = 0; i < lspaces; i++) {
2386 info->
text += fake_text;
2398 if (!lword) lword = word_res;
2399 if (rword != word_res) info->
num_words++;
2402 word_res = page_res_it.
forward();
2403 }
while (page_res_it.
row() == this_row);
2444 if (!after_recognition) {
2450 int trailing_ws_idx = strlen(text);
2451 while (trailing_ws_idx > 0 &&
2453 ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2454 isspace(text[trailing_ws_idx - 1]))
2456 if (trailing_ws_idx > 0) {
2458 for (
int i = 0; i < lspaces; i++)
2460 for (
int i = 0; i < trailing_ws_idx; i++)
2461 info->
text += text[i];
2473 int num_leaders = 0;
2483 word_res = page_res_it.
forward();
2484 }
while (page_res_it.
row() == this_row);
2485 info->
ltr = ltr >= rtl;
2488 if (!werds.
empty()) {
2489 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2493 info->
rword_box = rword->word->bounding_box();
2511 bool after_text_recognition,
2541 if (!row_infos.
empty()) {
2542 int min_lmargin = row_infos[0].pix_ldistance;
2543 int min_rmargin = row_infos[0].pix_rdistance;
2544 for (
int i = 1; i < row_infos.
size(); i++) {
2545 if (row_infos[i].pix_ldistance < min_lmargin)
2546 min_lmargin = row_infos[i].pix_ldistance;
2547 if (row_infos[i].pix_rdistance < min_rmargin)
2548 min_rmargin = row_infos[i].pix_rdistance;
2550 if (min_lmargin > 0 || min_rmargin > 0) {
2551 for (
int i = 0; i < row_infos.
size(); i++) {
2552 row_infos[i].pix_ldistance -= min_lmargin;
2553 row_infos[i].pix_rdistance -= min_rmargin;
2561 if (!is_image_block) {
2571 for (
int i = 0; i < row_owners.
size(); i++) {
const PAGE_RES_IT * PageResIt() const
const ParagraphModel * UniqueStartHypothesis() const
ParagraphModel ParagraphModelByOutline(int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void AddBodyLine(const ParagraphModel *model)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
bool NearlyEqual(T x, T y, T tolerance)
bool lword_likely_starts_idea
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
GeometricClassifierState(int dbg_level, GenericVector< RowScratchRegisters > *r, int r_start, int r_end)
const ParagraphModel * UniqueBodyHypothesis() const
const STRING & unichar_string() const
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
UNICHAR_ID unichar_id(int index) const
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
virtual bool Next(PageIteratorLevel level)
const char * SkipChars(const char *str, const char *toskip)
void DiscardUnusedModels(const SetOfModels &used_models)
void AssumeLeftJustification()
LineType GetLineType() const
void add(inT32 value, inT32 count)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * kCrownLeft
const GenericVector< Cluster > & AlignTabs() const
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void ConvertHypothesizedModelRunsToParagraphs(int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA *> *row_owners, ParagraphTheory *theory)
void GetClusters(GenericVector< Cluster > *clusters)
bool FirstWordWouldHaveFit(int row_a, int row_b)
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool contains(T object) const
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
ParagraphModel InternalParagraphModelByOutline(const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
virtual char * GetUTF8Text(PageIteratorLevel level) const
int get_index(T object) const
int push_back_new(T object)
void CanonicalizeDetectionResults(GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs)
void SeparateSimpleLeaderLines(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
STRING RtlEmbed(const STRING &word, bool rtlify)
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
WERD_CHOICE * best_choice
int average_interword_space
bool IsFullRow(int i) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after)
void Fail(int min_debug_level, const char *why) const
void StrongEvidenceClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
POLY_BLOCK * poly_block() const
int AlignsideTabIndex(int row_idx) const
SimpleClusterer(int max_cluster_width)
void GeometricClassify(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
const ParagraphModel * model
void ModelStrongEvidence(int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
const char * string() const
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
void AddStartLine(const ParagraphModel *model)
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
bool rword_indicates_list_item
bool is_very_first_or_continuation
bool AnyLtrCharsInWord() const
bool lword_likely_ends_idea
int ClosestCluster(const GenericVector< Cluster > &clusters, int value)
bool IsOpeningPunct(int ch)
BOOL8 flag(WERD_FLAGS mask) const
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
bool has_drop_cap() const
tesseract::ParagraphJustification just
void LeftoverSegments(const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
bool AnyRtlCharsInWord() const
const ParagraphModel * AddModel(const ParagraphModel &model)
void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
void Init(const RowInfo &row)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool Empty(PageIteratorLevel level) const
bool RowIsStranded(const GenericVector< RowScratchRegisters > &rows, int row)
bool IsTerminalPunct(int ch)
virtual bool Next(PageIteratorLevel level)
tesseract::ParagraphJustification justification() const
void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info)
void StartHypotheses(SetOfModels *models) const
GenericVector< RowScratchRegisters > * rows
BLOCK_RES * block() const
GenericVector< Cluster > left_tabs
void StrongHypotheses(SetOfModels *models) const
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info)
void NonCenteredModels(SetOfModels *models)
void CalculateTabStops(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
int OffsideIndent(tesseract::ParagraphJustification just) const
bool StrongModel(const ParagraphModel *model)
const UNICHARSET * uch_set
void NonNullHypotheses(SetOfModels *models) const
GenericVectorEqEq< const ParagraphModel * > SetOfModels
const char * SkipOne(const char *str, const char *toskip)
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
bool AsciiLikelyListItem(const STRING &word)
bool rword_likely_starts_idea
const ParagraphModel * kCrownRight
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
void MarkStrongEvidence(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
GenericVector< ParagraphModel * > & models()
void DiscardNonMatchingHypotheses(const SetOfModels &models)
bool IsLatinLetter(int ch)
bool LikelyListMarkUnicode(int ch)
double ile(double frac) const
bool get_isdigit(UNICHAR_ID unichar_id) const
void AssumeRightJustification()
const char * id_to_unichar(UNICHAR_ID id) const
void MarkRowsWithModel(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
bool LikelyListNumeral(const STRING &word)
void init_to_size(int size, T t)
const GenericVector< Cluster > & OffsideTabs() const
GenericVector< Cluster > right_tabs
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
ParagraphModel Model() const
Cluster(int cen, int num)
ParagraphModelSmearer(GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool lword_indicates_list_item
TBOX bounding_box() const
bool rword_likely_ends_idea
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool LikelyListMark(const STRING &word)