// Multiplier applied to the difference (f2 - f1) by StopperAmbigThreshold()
// when it computes its threshold value.
36 static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant subtracted from the scaled difference in StopperAmbigThreshold().
// Its negation is also used elsewhere in this file as an upper bound when
// max_certainty_delta is clamped.
39 static const double kStopperAmbiguityThresholdOffset = 1.5;
// Computes a threshold from two values:
//   (f2 - f1) * kStopperAmbiguityThresholdGain - kStopperAmbiguityThresholdOffset
// The result grows with the gap between f2 and f1 and equals
// -kStopperAmbiguityThresholdOffset when the two are equal.
// NOTE(review): presumably f1 and f2 are adjust factors / certainties of two
// word choices being compared -- confirm against the call sites.
53 static double StopperAmbigThreshold(
double f1,
double f2) {
54 return (f2 - f1) * kStopperAmbiguityThresholdGain -
55 kStopperAmbiguityThresholdOffset;
64 bool merge_similar_words,
65 BLOCK_LIST *the_block_list,
68 BLOCK_IT block_it(the_block_list);
70 for (block_it.mark_cycle_pt();
71 !block_it.cycled_list(); block_it.forward()) {
72 block_res_it.add_to_end(
new BLOCK_RES(merge_similar_words,
85 ROW_IT row_it (the_block->
row_list ());
99 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
100 row_res_it.add_to_end(
new ROW_RES(merge_similar_words, row_it.data()));
121 bool add_next_word =
false;
125 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
133 }
else if (merge_similar_words) {
137 word_res->
odd_size = !add_next_word;
139 WERD* next_word = word_it.data_relative(1);
140 if (merge_similar_words) {
148 int prev_right = union_box.
right();
149 union_box += next_box;
153 add_next_word =
false;
162 copy_word =
new WERD;
163 *copy_word = *(word_it.data());
167 word_res_it.add_to_end(combo);
173 word_res_it.add_to_end(word_res);
206 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.
best_choices));
208 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
210 wc_dest_it.add_after_then_move(
new WERD_CHOICE(*choice));
212 if (!wc_dest_it.empty()) {
213 wc_dest_it.move_to_first();
297 const TBOX* norm_box,
300 bool allow_detailed_fx,
317 float word_xheight = use_body_size && row != NULL && row->
body_size() > 0.0f
321 norm_mode_hint, norm_box, &
denorm);
351 if (blob_count > 0) {
357 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
358 TBOX box = b_it.data()->bounding_box();
363 delete [] fake_choices;
395 for (
int b = 0; b < num_blobs; ++b) {
399 if (b + 1 < num_blobs) {
420 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
432 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&
best_choices));
433 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
452 tprintf(
"raw_choice has total of states = %d vs ratings dim of %d\n",
458 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
461 tprintf(
"Cooked #%d has total of states = %d vs ratings dim of %d\n",
473 (word_to_debug != NULL && *word_to_debug !=
'\0' &&
best_choice != NULL &&
480 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
491 tprintf(
"Best choice: accepted=%d, adaptable=%d, done=%d : ",
508 if (debug_level >= 2)
512 for (it.forward(); !it.at_first(); it.forward(), ++index) {
521 int i = 0, j = 0, chunk = 0;
527 while (i < choice->length() && j < best_choice->length()) {
530 if (debug_level >= 2) {
532 label.
add_str_int(
"\nDiscarding bad choice #", index);
534 tprintf(
"i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g" 535 " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n",
544 while (choice_chunk < chunk && ++i < choice->length())
545 choice_chunk += choice->
state(i);
547 while (best_chunk < chunk && ++j < best_choice->length())
563 float avg_rating = 0.0f;
564 int num_error_chunks = 0;
567 while (chunk < end_chunk) {
568 if (chunk >= end_raw_chunk) {
580 if (num_error_chunks > 0) {
581 avg_rating /= num_error_chunks;
582 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
584 *thresholds = max_rating;
587 if (*thresholds > max_rating)
588 *thresholds = max_rating;
589 if (*thresholds < min_rating)
590 *thresholds = min_rating;
620 float max_certainty_delta =
623 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
624 max_certainty_delta = -kStopperAmbiguityThresholdOffset;
626 max_certainty_delta) {
630 tprintf(
"Discarding choice \"%s\" with an overly low certainty" 631 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
645 bool inserted =
false;
650 if (choice->
rating() > word_choice->
rating() && !inserted) {
652 it.add_before_stay_put(word_choice);
654 if (num_choices == 0)
665 tprintf(
"Discarding duplicate choice \"%s\", rating %g vs %g\n",
673 if (num_choices > max_num_choices)
677 }
while (!it.at_first());
679 if (!inserted && num_choices < max_num_choices) {
680 it.add_to_end(word_choice);
682 if (num_choices == 0)
690 word_choice->
print(
" Word Choice");
702 template<
class T>
static void MovePointerData(T** dest, T**src) {
711 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&
best_choices));
712 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
713 if (!it.at_first()) alternates_str +=
"\", \"";
714 alternates_str += it.data()->unichar_string();
716 tprintf(
"Alternates for \"%s\": {\"%s\"}\n",
724 for (
int b = start_blob; b <= last_blob; ++b) {
765 word->seam_array.clear();
776 wc_it.add_list_after(&
word->best_choices);
778 if (
word->blamer_bundle != NULL) {
836 for (
int i = 0; i < word_len; ++i) {
863 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
864 wc_it.data()->SetAllScriptPositions(position);
879 for (
int c = 0; c < blob_count; ++c) {
880 BLOB_CHOICE_LIST* choice_list =
new BLOB_CHOICE_LIST;
881 BLOB_CHOICE_IT choice_it(choice_list);
882 choice_it.add_after_then_move(choices[c]);
896 for (
int b = 0; b < num_blobs; ++b) {
900 BLOB_CHOICE_LIST* choices =
ratings->
get(b, b);
901 if (choices != NULL && !choices->empty()) {
902 BLOB_CHOICE_IT bc_it(choices);
905 rating = choice->
rating();
936 bool modified =
false;
940 if (new_id != INVALID_UNICHAR_ID &&
956 BLOB_CHOICE_IT bc_it(blob_choices);
957 bc_it.add_before_then_move(blob_choice);
986 static int is_simple_quote(
const char* signed_str,
int length) {
987 const unsigned char* str =
988 reinterpret_cast<const unsigned char*
>(signed_str);
990 return (length == 1 && (*str ==
'\'' || *str ==
'`')) ||
992 (length == 3 && ((*str == 0xe2 &&
993 *(str + 1) == 0x80 &&
994 *(str + 2) == 0x98) ||
996 *(str + 1) == 0x80 &&
997 *(str + 2) == 0x99)));
1005 if (is_simple_quote(ch, strlen(ch)) &&
1006 is_simple_quote(next_ch, strlen(next_ch)))
1008 return INVALID_UNICHAR_ID;
1027 if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1028 (*ch ==
'-' || *ch ==
'~') && (*next_ch ==
'-' || *next_ch ==
'~'))
1030 return INVALID_UNICHAR_ID;
1057 return INVALID_UNICHAR_ID;
1074 for (
int index = start; index < start +
count - 1; ++index) {
1077 if (seam != NULL && seam->
HasAnySplits())
return false;
1196 return word_res == other.word_res &&
1197 row_res == other.row_res &&
1198 block_res == other.block_res;
1203 if (other.block_res == NULL) {
1205 if (block_res == NULL)
1209 if (block_res == NULL) {
1212 if (block_res == other.block_res) {
1213 if (other.row_res == NULL || row_res == NULL) {
1217 if (row_res == other.row_res) {
1219 ASSERT_HOST(other.word_res != NULL && word_res != NULL);
1220 if (word_res == other.word_res) {
1226 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1227 word_res_it.forward()) {
1228 if (word_res_it.data() == word_res) {
1230 }
else if (word_res_it.data() == other.word_res) {
1234 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" == NULL);
1239 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1240 row_res_it.forward()) {
1241 if (row_res_it.data() == row_res) {
1243 }
else if (row_res_it.data() == other.row_res) {
1247 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" == NULL);
1252 for (block_res_it.mark_cycle_pt();
1253 !block_res_it.cycled_list(); block_res_it.forward()) {
1254 if (block_res_it.data() == block_res) {
1256 }
else if (block_res_it.data() == other.block_res) {
1261 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" == NULL);
1276 WERD_RES_IT wr_it(&
row()->word_res_list);
1277 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1279 if (
word == word_res)
1283 wr_it.add_before_then_move(new_res);
1284 if (wr_it.at_first()) {
1295 static void ComputeBlobEnds(
const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
1301 TBOX blob_box = blob_it.data()->bounding_box();
1303 for (
int b = 1; b < length; ++b) {
1304 blob_box += blob_it.data()->bounding_box();
1310 if (!blob_it.at_first() || next_word_blobs != NULL) {
1311 if (blob_it.at_first())
1312 blob_it.set_to_list(next_word_blobs);
1313 blob_end = (blob_box.
right() + blob_it.data()->bounding_box().left()) / 2;
1324 if (words->
empty()) {
1331 (*words)[0]->word->set_flag(
W_BOL,
true);
1333 (*words)[0]->word->set_blanks(1);
1343 WERD_IT w_it(
row()->
row->word_list());
1345 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1354 WERD_RES_IT wr_it(&
row()->word_res_list);
1355 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1357 if (
word == input_word)
1367 for (
int w = 0; w < words->
size(); ++w) {
1371 C_BLOB_LIST* next_word_blobs =
1372 w + 1 < words->
size() ? (*words)[w + 1]->word->cblob_list() : NULL;
1373 ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
1379 for (
int i = 0; i < blob_ends.
size(); ++i) {
1380 int end_x = blob_ends[i];
1383 while (!src_b_it.empty() &&
1384 src_b_it.data()->bounding_box().x_middle() < end_x) {
1385 blob_box += src_b_it.data()->bounding_box();
1386 dest_it.add_after_then_move(src_b_it.extract());
1389 while (!rej_b_it.empty() &&
1390 rej_b_it.data()->bounding_box().x_middle() < end_x) {
1391 blob_box += rej_b_it.data()->bounding_box();
1392 dest_it.add_after_then_move(rej_b_it.extract());
1398 if (i > 0 && blob_box.
left() < blob_ends[i - 1])
1399 blob_box.
set_left(blob_ends[i - 1]);
1400 if (blob_box.
right() > end_x)
1402 box_word->InsertBox(i, blob_box);
1407 for (
int i = 0; i < box_word->length(); ++i) {
1408 TBOX box = box_word->BlobBox(i);
1412 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
1413 dest_it.forward()) {
1414 TBOX blob_box = dest_it.data()->bounding_box();
1415 if (blob_box.
left() < blob_ends[i] &&
1416 (i == 0 || blob_box.
right() >= blob_ends[i - 1])) {
1417 if (i > 0 && blob_box.
left() < blob_ends[i - 1])
1418 blob_box.
set_left(blob_ends[i - 1]);
1419 if (blob_box.
right() > blob_ends[i])
1421 box_word->ChangeBox(i, blob_box);
1432 w_it.add_before_stay_put(word_w->
word);
1436 wr_it.add_before_stay_put(word_w);
1444 delete w_it.extract();
1445 delete wr_it.extract();
1458 WERD_IT w_it(
row()->
row->word_list());
1459 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1460 if (w_it.data() == word_res->
word) {
1465 delete w_it.extract();
1469 WERD_RES_IT wr_it(&
row()->word_res_list);
1470 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1471 if (wr_it.data() == word_res) {
1477 delete wr_it.extract();
1490 WERD_RES_IT wr_it(&
row()->word_res_list);
1491 for (wr_it.mark_cycle_pt();
1492 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1496 real_word = wr_it.data()->word;
1512 block_res_it.mark_cycle_pt();
1513 prev_block_res = NULL;
1514 prev_row_res = NULL;
1515 prev_word_res = NULL;
1519 next_block_res = NULL;
1520 next_row_res = NULL;
1521 next_word_res = NULL;
1522 internal_forward(
true, empty_ok);
1523 return internal_forward(
false, empty_ok);
1534 if (row_res == next_row_res) {
1537 word_res_it.move_to_first();
1538 for (word_res_it.mark_cycle_pt();
1539 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1540 word_res_it.forward()) {
1541 if (!word_res_it.data()->part_of_combo) {
1542 if (prev_row_res == row_res) prev_word_res = word_res;
1543 word_res = word_res_it.data();
1547 word_res_it.forward();
1551 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1552 if (!wr_it.data()->part_of_combo) {
1553 if (prev_row_res == row_res) prev_word_res = word_res;
1554 word_res = wr_it.data();
1575 WERD_RES *PAGE_RES_IT::internal_forward(
bool new_block,
bool empty_ok) {
1576 bool new_row =
false;
1578 prev_block_res = block_res;
1579 prev_row_res = row_res;
1580 prev_word_res = word_res;
1581 block_res = next_block_res;
1582 row_res = next_row_res;
1583 word_res = next_word_res;
1584 next_block_res = NULL;
1585 next_row_res = NULL;
1586 next_word_res = NULL;
1588 while (!block_res_it.cycled_list()) {
1591 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1592 row_res_it.mark_cycle_pt();
1593 if (row_res_it.empty() && empty_ok) {
1594 next_block_res = block_res_it.data();
1599 while (!row_res_it.cycled_list()) {
1602 word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1603 word_res_it.mark_cycle_pt();
1606 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1607 word_res_it.forward();
1608 if (!word_res_it.cycled_list()) {
1609 next_block_res = block_res_it.data();
1610 next_row_res = row_res_it.data();
1611 next_word_res = word_res_it.data();
1612 word_res_it.forward();
1616 row_res_it.forward();
1620 block_res_it.forward();
1627 (new_block || prev_word_res == NULL) ? NULL : prev_word_res->
best_choice;
1639 if (!row)
return NULL;
1653 while (block_res == next_block_res &&
1654 (next_row_res != NULL && next_row_res->
row != NULL &&
1656 internal_forward(
false,
true);
1658 return internal_forward(
false,
true);
1668 while (block_res == next_block_res) {
1669 internal_forward(
false,
true);
1671 return internal_forward(
false,
true);
1675 inT16 chars_in_word;
1676 inT16 rejects_in_word = 0;
1686 block_res->
rej_count += rejects_in_word;
1688 if (chars_in_word == rejects_in_word)
#define ELISTIZE(CLASSNAME)
bool PiecesAllNatural(int start, int count) const
bool operator==(const PAGE_RES_IT &other) const
void start_seam_list(TWERD *word, GenericVector< SEAM *> *seam_array)
void InsertSeam(int blob_number, SEAM *seam)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
void remove_unichar_id(int index)
void PrintBestChoices() const
CRUNCH_MODE unlv_crunch_mode
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
void IncreaseBandSize(int bandwidth)
void UpdateStateForSplit(int blob_position)
int cmp(const PAGE_RES_IT &other) const
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
GenericVector< TBLOB * > blobs
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
void InsertBox(int index, const TBOX &box)
bool TESS_API contains_unichar(const char *const unichar_repr) const
const STRING & unichar_string() const
void SetupNormTruthWord(const DENORM &denorm)
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX &> *box_cb)
UNICHAR_ID unichar_id(int index) const
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
void DebugTopChoice(const char *msg) const
float adjust_factor() const
static int SortByXMiddle(const void *v1, const void *v2)
void initialise(inT16 length)
void insert(T t, int index)
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
void operator=(const ELIST_LINK &)
const FontInfo * fontinfo2
int state(int index) const
void set_unichar_id(UNICHAR_ID unichar_id, int index)
void ConsumeWordResults(WERD_RES *word)
void SetupBlobWidthsAndGaps()
void MergeAdjacentBlobs(int index)
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
void set_permuter(uinT8 perm)
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
void DebugWordChoices(bool debug, const char *word_to_debug)
ROW_LIST * row_list()
get rows
int GetBlobsWidth(int start_blob, int last_blob)
void remove_pos(inT16 pos)
void ReplaceBestChoice(WERD_CHOICE *choice)
WERD_RES_LIST word_res_list
void SetAllScriptPositions(tesseract::ScriptPos position)
void move(GenericVector< T > *from)
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
WERD_RES * restart_page()
WERD_CHOICE * best_choice
TBOX bounding_box() const
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
bool Valid(const MATRIX &m) const
void CopySimpleFields(const WERD_RES &source)
void SetScriptPositions()
UNICHAR_ID unichar_id() const
void add_str_int(const char *str, int number)
GenericVector< SEAM * > seam_array
void MergeBlobs(int start, int end)
WERD_RES * forward_paragraph()
POLY_BLOCK * poly_block() const
void SetupFake(const UNICHARSET &uch)
WERD_RES * forward_block()
tesseract::BoxWord * box_word
const char * string() const
bool HasAnySplits() const
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
bool dangerous_ambig_found() const
const FontInfo * fontinfo
MATRIX * ConsumeAndMakeBigger(int ind)
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
BOOL8 flag(WERD_FLAGS mask) const
inT32 whole_word_rej_count
void put(ICOORD pos, const T &thing)
const BLOCK * block() const
tesseract::BoxWord * bln_boxes
GenericVector< int > blob_gaps
BlamerBundle * blamer_bundle
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
MATRIX_COORD MatrixCoord(int index) const
void set_unichar_id(UNICHAR_ID newunichar_id)
void MakeCurrentWordFuzzy()
void delete_data_pointers()
void SetAllScriptPositions(tesseract::ScriptPos position)
BLOB_CHOICE * GetBlobChoice(int index) const
int TotalOfStates() const
bool LogNewRawChoice(WERD_CHOICE *word_choice)
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
void InitForRetryRecognition(const WERD_RES &source)
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
void MergeBoxes(int start, int end)
WERD_CHOICE_LIST best_choices
bool PrepareToInsertSeam(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int insert_index, bool modify)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
GenericVector< STRING > correct_text
void CopyResults(const BlamerBundle &other)
int GetBlobsGap(int blob_index)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
void SetupWordScript(const UNICHARSET &unicharset_in)
void SetScriptPositions(bool small_caps, TWERD *word)
const double kMaxLineSizeRatio
const UNICHARSET * uch_set
const TBOX & BlobBox(int index) const
void delete_matrix_pointers()
void FilterWordChoices(int debug_level)
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
tesseract::Tesseract * tesseract
void ComputeBoundingBoxes()
void CopyTruth(const BlamerBundle &other)
ROW_RES_LIST row_res_list
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
const int kWordrecMaxNumJoinChunks
WERD_RES * start_page(bool empty_ok)
bool script_has_xheight() const
GenericVector< int > blob_widths
static BoxWord * CopyFromNormalized(TWERD *tessword)
void copy_on(WERD_RES *word_res)
GenericVector< int > best_state
const char * id_to_unichar(UNICHAR_ID id) const
bool get_enabled(UNICHAR_ID unichar_id) const
void set_script_id(int id)
void set_flag(WERD_FLAGS mask, BOOL8 value)
void CloneChoppedToRebuild()
C_BLOB_LIST * cblob_list()
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
void FakeWordFromRatings(PermuterType permuter)
WERD_CHOICE ** prev_word_best_choice
const double kMaxWordSizeRatio
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
void BestChoiceToCorrectText()
TBOX bounding_box() const
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
BLOCK_RES_LIST block_res_list
const double kMaxWordGapRatio
C_BLOB_LIST * rej_cblob_list()
WERD_RES & operator=(const WERD_RES &source)