tesseract  3.05.02
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Detailed Description

Definition at line 42 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 44 of file language_model.cpp.

46  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
47  dict->getCCUtil()->params()),
49  "Turn on/off the use of character ngram model",
50  dict->getCCUtil()->params()),
52  "Maximum order of the character ngram model",
53  dict->getCCUtil()->params()),
55  "Maximum number of prunable (those for which"
56  " PrunablePath() is true) entries in each viterbi list"
57  " recorded in BLOB_CHOICEs",
58  dict->getCCUtil()->params()),
60  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
61  dict->getCCUtil()->params()),
63  "To avoid overly small denominators use this as the "
64  "floor of the probability returned by the ngram model.",
65  dict->getCCUtil()->params()),
67  "Average classifier score of a non-matching unichar.",
68  dict->getCCUtil()->params()),
70  "Use only the first UTF8 step of the given string"
71  " when computing log probabilities.",
72  dict->getCCUtil()->params()),
74  "Strength of the character ngram model relative to the"
75  " character classifier ",
76  dict->getCCUtil()->params()),
78  "Factor to bring log-probs into the same range as ratings"
79  " when multiplied by outline length ",
80  dict->getCCUtil()->params()),
82  "Words are delimited by space",
83  dict->getCCUtil()->params()),
85  "Minimum length of compound words",
86  dict->getCCUtil()->params()),
88  "Penalty for words not in the frequent word dictionary",
89  dict->getCCUtil()->params()),
91  "Penalty for non-dictionary words",
92  dict->getCCUtil()->params()),
94  "Penalty for inconsistent punctuation",
95  dict->getCCUtil()->params()),
97  "Penalty for inconsistent case",
98  dict->getCCUtil()->params()),
100  "Penalty for inconsistent script",
101  dict->getCCUtil()->params()),
103  "Penalty for inconsistent character type",
104  dict->getCCUtil()->params()),
105  // TODO(daria, rays): enable font consistency checking
106  // after improving font analysis.
108  "Penalty for inconsistent font",
109  dict->getCCUtil()->params()),
111  "Penalty for inconsistent spacing",
112  dict->getCCUtil()->params()),
114  "Penalty increment",
115  dict->getCCUtil()->params()),
116  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
117  dict->getCCUtil()->params()),
119  "Use sigmoidal score for certainty",
120  dict->getCCUtil()->params()),
121  dawg_args_(NULL, new DawgPositionVector(), NO_PERM),
122  fontinfo_table_(fontinfo_table), dict_(dict),
123  fixed_pitch_(false), max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
126 }
double language_model_penalty_non_dict_word
int language_model_viterbi_list_max_num_prunable
const UnicityTable< FontInfo > * fontinfo_table_
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
bool language_model_ngram_use_only_first_uft8_step
bool language_model_ngram_space_delimited_language
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
double language_model_ngram_nonmatch_score
#define ASSERT_HOST(x)
Definition: errcode.h:84
double language_model_penalty_non_freq_dict_word

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 128 of file language_model.cpp.

128  {
129  delete dawg_args_.updated_dawgs;
130 }
DawgPositionVector * updated_dawgs
Definition: dict.h:81

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 95 of file language_model.h.

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 301 of file language_model.h.

301  {
302  return (vse.dawg_info != NULL || vse.Consistent() ||
303  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
304  }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 557 of file language_model.cpp.

568  {
569  ViterbiStateEntry_IT vit;
570  if (language_model_debug_level > 1) {
571  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
572  " certainty=%.4f top_choice_flags=0x%x",
574  b->rating(), b->certainty(), top_choice_flags);
576  tprintf(" parent_vse=%p\n", parent_vse);
577  else
578  tprintf("\n");
579  }
580  // Check whether the list is full.
581  if (curr_state != NULL &&
582  curr_state->viterbi_state_entries_length >=
584  if (language_model_debug_level > 1) {
585  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
586  }
587  return false;
588  }
589 
590  // Invoke Dawg language model component.
591  LanguageModelDawgInfo *dawg_info =
592  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
593 
594  float outline_length =
596  // Invoke Ngram language model component.
597  LanguageModelNgramInfo *ngram_info = NULL;
599  ngram_info = GenerateNgramInfo(
601  denom, curr_col, curr_row, outline_length, parent_vse);
602  ASSERT_HOST(ngram_info != NULL);
603  }
604  bool liked_by_language_model = dawg_info != NULL ||
605  (ngram_info != NULL && !ngram_info->pruned);
606  // Quick escape if not liked by the language model, can't be consistent
607  // xheight, and not top choice.
608  if (!liked_by_language_model && top_choice_flags == 0) {
609  if (language_model_debug_level > 1) {
610  tprintf("Language model components very early pruned this entry\n");
611  }
612  delete ngram_info;
613  delete dawg_info;
614  return false;
615  }
616 
617  // Check consistency of the path and set the relevant consistency_info.
618  LMConsistencyInfo consistency_info(
619  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
620  // Start with just the x-height consistency, as it provides significant
621  // pruning opportunity.
622  consistency_info.ComputeXheightConsistency(
624  // Turn off xheight consistent flag if not consistent.
625  if (consistency_info.InconsistentXHeight()) {
626  top_choice_flags &= ~kXhtConsistentFlag;
627  }
628 
629  // Quick escape if not liked by the language model, not consistent xheight,
630  // and not top choice.
631  if (!liked_by_language_model && top_choice_flags == 0) {
632  if (language_model_debug_level > 1) {
633  tprintf("Language model components early pruned this entry\n");
634  }
635  delete ngram_info;
636  delete dawg_info;
637  return false;
638  }
639 
640  // Compute the rest of the consistency info.
641  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
642  word_res, &consistency_info);
643  if (dawg_info != NULL && consistency_info.invalid_punc) {
644  consistency_info.invalid_punc = false; // do not penalize dict words
645  }
646 
647  // Compute cost of associating the blobs that represent the current unichar.
648  AssociateStats associate_stats;
649  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
650  parent_vse, word_res, &associate_stats);
651  if (parent_vse != NULL) {
652  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
653  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
654  }
655 
656  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
657  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
658  parent_vse, b, 0.0, outline_length,
659  consistency_info, associate_stats, top_choice_flags, dawg_info,
660  ngram_info, (language_model_debug_level > 0) ?
661  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
662  new_vse->cost = ComputeAdjustedPathCost(new_vse);
664  tprintf("Adjusted cost = %g\n", new_vse->cost);
665 
666  // Invoke Top Choice language model component to make the final adjustments
667  // to new_vse->top_choice_flags.
668  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
669  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
670  }
671 
672  // If language model components did not like this unichar - return.
673  bool keep = new_vse->top_choice_flags || liked_by_language_model;
674  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
675  consistency_info.inconsistent_script) { // with inconsistent script
676  keep = false;
677  }
678  if (!keep) {
679  if (language_model_debug_level > 1) {
680  tprintf("Language model components did not like this entry\n");
681  }
682  delete new_vse;
683  return false;
684  }
685 
686  // Discard this entry if it represents a prunable path and
687  // language_model_viterbi_list_max_num_prunable such entries with a lower
688  // cost have already been recorded.
689  if (PrunablePath(*new_vse) &&
690  (curr_state->viterbi_state_entries_prunable_length >=
692  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
693  if (language_model_debug_level > 1) {
694  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
695  new_vse->cost,
696  curr_state->viterbi_state_entries_prunable_max_cost);
697  }
698  delete new_vse;
699  return false;
700  }
701 
702  // Update best choice if needed.
703  if (word_end) {
704  UpdateBestChoice(new_vse, pain_points, word_res,
705  best_choice_bundle, blamer_bundle);
706  // Discard the entry if UpdateBestChoice() found flaws in it.
707  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
708  new_vse != best_choice_bundle->best_vse) {
709  if (language_model_debug_level > 1) {
710  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
711  }
712  delete new_vse;
713  return false;
714  }
715  }
716 
717  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
718  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
719  false, new_vse);
720  curr_state->viterbi_state_entries_length++;
721  if (PrunablePath(*new_vse)) {
722  curr_state->viterbi_state_entries_prunable_length++;
723  }
724 
725  // Update lms->viterbi_state_entries_prunable_max_cost and clear
726  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
727  if ((curr_state->viterbi_state_entries_prunable_length >=
729  new_vse->top_choice_flags) {
730  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
731  int prunable_counter = language_model_viterbi_list_max_num_prunable;
732  vit.set_to_list(&(curr_state->viterbi_state_entries));
733  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
734  ViterbiStateEntry *curr_vse = vit.data();
735  // Clear the appropriate top choice flags of the entries in the
736  // list that have cost higher thank new_entry->cost
737  // (since they will not be top choices any more).
738  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
739  curr_vse->cost > new_vse->cost) {
740  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
741  }
742  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
743  // Update curr_state->viterbi_state_entries_prunable_max_cost.
744  if (prunable_counter == 0) {
745  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
746  if (language_model_debug_level > 1) {
747  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
748  curr_state->viterbi_state_entries_prunable_max_cost);
749  }
750  prunable_counter = -1; // stop counting
751  }
752  }
753  }
754 
755  // Print the newly created ViterbiStateEntry.
756  if (language_model_debug_level > 2) {
757  new_vse->Print("New");
759  curr_state->Print("Updated viterbi list");
760  }
761 
762  return true;
763 }
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kSmallestRatingFlag
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int language_model_viterbi_list_max_num_prunable
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:126
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
float rating() const
Definition: ratngs.h:79
static const LanguageModelFlagsType kXhtConsistentFlag
#define tprintf(...)
Definition: tprintf.h:31
float certainty() const
Definition: ratngs.h:82
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
static const float kBadRating
Definition: ratngs.h:273
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 104 of file language_model.h.

104  {
106  // cert is assumed to be between 0 and -dict_->certainty_scale.
107  // If you enable language_model_use_sigmoidal_certainty, you
108  // need to adjust language_model_ngram_nonmatch_score as well.
109  cert = -cert / dict_->certainty_scale;
110  return 1.0f / (1.0f + exp(10.0f * cert));
111  } else {
112  return (-1.0f / cert);
113  }
114  }
double certainty_scale
Definition: dict.h:609

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1192 of file language_model.cpp.

1192  {
1193  ASSERT_HOST(vse != NULL);
1194  if (params_model_.Initialized()) {
1195  float features[PTRAIN_NUM_FEATURE_TYPES];
1196  ExtractFeaturesFromPath(*vse, features);
1197  float cost = params_model_.ComputeCost(features);
1198  if (language_model_debug_level > 3) {
1199  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1200  if (language_model_debug_level >= 5) {
1201  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1202  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1203  }
1204  }
1205  }
1206  return cost * vse->outline_length;
1207  } else {
1208  float adjustment = 1.0f;
1209  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1211  }
1212  if (vse->dawg_info == NULL) {
1214  if (vse->length > language_model_min_compound_length) {
1215  adjustment += ((vse->length - language_model_min_compound_length) *
1217  }
1218  }
1219  if (vse->associate_stats.shape_cost > 0) {
1220  adjustment += vse->associate_stats.shape_cost /
1221  static_cast<float>(vse->length);
1222  }
1224  ASSERT_HOST(vse->ngram_info != NULL);
1225  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1226  } else {
1227  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1228  vse->consistency_info);
1229  return vse->ratings_sum * adjustment;
1230  }
1231  }
1232 }
float ComputeCost(const float features[]) const
double language_model_penalty_non_dict_word
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
double language_model_penalty_non_freq_dict_word
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 116 of file language_model.h.

116  {
117  if (num_problems == 0) return 0.0f;
118  if (num_problems == 1) return penalty;
119  return (penalty + (language_model_penalty_increment *
120  static_cast<float>(num_problems-1)));
121  }

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 272 of file language_model.h.

276  {
278  col, row,
279  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
280  (parent_vse != NULL) ? parent_vse->length : 0,
281  fixed_pitch_, max_char_wh_ratio,
282  word_res, language_model_debug_level > 2, associate_stats);
283  }
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:37

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 127 of file language_model.h.

129  {
130  if (dawg_info != NULL) {
131  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
133  (consistency_info.inconsistent_script ?
135  }
136  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
138  ComputeAdjustment(consistency_info.NumInconsistentCase(),
140  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
142  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
144  (consistency_info.inconsistent_script ?
146  (consistency_info.inconsistent_font ?
148  }
float ComputeAdjustment(int num_problems, float penalty)

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 989 of file language_model.cpp.

989  {
990  if (curr_list->empty()) return 1.0f;
991  float denom = 0.0f;
992  int len = 0;
993  BLOB_CHOICE_IT c_it(curr_list);
994  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
995  ASSERT_HOST(c_it.data() != NULL);
996  ++len;
997  denom += CertaintyScore(c_it.data()->certainty());
998  }
999  assert(len != 0);
1000  // The ideal situation would be to have the classifier scores for
1001  // classifying each position as each of the characters in the unicharset.
1002  // Since we can not do this because of speed, we add a very crude estimate
1003  // of what these scores for the "missing" classifications would sum up to.
1004  denom += (dict_->getUnicharset().size() - len) *
1006 
1007  return denom;
1008 }
int size() const
Definition: unicharset.h:297
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double language_model_ngram_nonmatch_score
float CertaintyScore(float cert)
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 929 of file language_model.cpp.

935  {
936  const char *context_ptr = context;
937  char *modified_context = NULL;
938  char *modified_context_end = NULL;
939  const char *unichar_ptr = unichar;
940  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
941  float prob = 0.0f;
942  int step = 0;
943  while (unichar_ptr < unichar_end &&
944  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
945  if (language_model_debug_level > 1) {
946  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
947  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
948  }
949  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
950  ++(*unichar_step_len);
952  unichar_ptr += step;
953  // If there are multiple UTF8 characters present in unichar, context is
954  // updated to include the previously examined characters from str,
955  // unless use_only_first_uft8_step is true.
956  if (unichar_ptr < unichar_end) {
957  if (modified_context == NULL) {
958  int context_len = strlen(context);
959  modified_context =
960  new char[context_len + strlen(unichar_ptr) + step + 1];
961  strncpy(modified_context, context, context_len);
962  modified_context_end = modified_context + context_len;
963  context_ptr = modified_context;
964  }
965  strncpy(modified_context_end, unichar_ptr - step, step);
966  modified_context_end += step;
967  *modified_context_end = '\0';
968  }
969  }
970  prob /= static_cast<float>(*unichar_step_len); // normalize
971  if (prob < language_model_ngram_small_prob) {
972  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
973  *found_small_prob = true;
975  }
976  *ngram_cost = -1.0*log2(prob);
977  float ngram_and_classifier_cost =
978  -1.0*log2(CertaintyScore(certainty)/denom) +
979  *ngram_cost * language_model_ngram_scale_factor;
980  if (language_model_debug_level > 1) {
981  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
982  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
983  ngram_and_classifier_cost);
984  }
985  delete[] modified_context;
986  return ngram_and_classifier_cost;
987 }
bool language_model_ngram_use_only_first_uft8_step
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:370
#define tprintf(...)
Definition: tprintf.h:31
float CertaintyScore(float cert)
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1383 of file language_model.cpp.

1388  {
1389  if (truth_path != NULL) {
1390  *truth_path =
1391  (blamer_bundle != NULL &&
1392  vse->length == blamer_bundle->correct_segmentation_length());
1393  }
1394  BLOB_CHOICE *curr_b = vse->curr_b;
1395  ViterbiStateEntry *curr_vse = vse;
1396 
1397  int i;
1398  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1399 
1400  // Re-compute the variance of the width-to-height ratios (since we now
1401  // can compute the mean over the whole word).
1402  float full_wh_ratio_mean = 0.0f;
1403  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1404  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1405  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1406  static_cast<float>(vse->length));
1407  vse->associate_stats.full_wh_ratio_var = 0.0f;
1408  }
1409 
1410  // Construct a WERD_CHOICE by tracing parent pointers.
1411  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1412  word->set_length(vse->length);
1413  int total_blobs = 0;
1414  for (i = (vse->length-1); i >= 0; --i) {
1415  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
1416  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1417  *truth_path = false;
1418  }
1419  // The number of blobs used for this choice is row - col + 1.
1420  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1421  total_blobs += num_blobs;
1422  word->set_blob_choice(i, num_blobs, curr_b);
1423  // Update the width-to-height ratio variance. Useful non-space delimited
1424  // languages to ensure that the blobs are of uniform width.
1425  // Skip leading and trailing punctuation when computing the variance.
1426  if ((full_wh_ratio_mean != 0.0f &&
1427  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1428  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1429  vse->associate_stats.full_wh_ratio_var +=
1430  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1431  if (language_model_debug_level > 2) {
1432  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1433  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1434  }
1435  }
1436 
1437  // Mark the word as compound if compound permuter was set for any of
1438  // the unichars on the path (usually this will happen for unichars
1439  // that are compounding operators, like "-" and "/").
1440  if (!compound && curr_vse->dawg_info &&
1441  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1442 
1443  // Update curr_* pointers.
1444  curr_vse = curr_vse->parent_vse;
1445  if (curr_vse == NULL) break;
1446  curr_b = curr_vse->curr_b;
1447  }
1448  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1449  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1450  // Re-adjust shape cost to include the updated width-to-height variance.
1451  if (full_wh_ratio_mean != 0.0f) {
1452  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1453  }
1454 
1455  word->set_rating(vse->ratings_sum);
1456  word->set_certainty(vse->min_certainty);
1457  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1458  vse->consistency_info.BodyMaxXHeight());
1459  if (vse->dawg_info != NULL) {
1460  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1461  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1462  word->set_permuter(NGRAM_PERM);
1463  } else if (vse->top_choice_flags) {
1465  } else {
1466  word->set_permuter(NO_PERM);
1467  }
1468  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1469  word_res->ratings));
1470  return word;
1471 }
void set_length(int len)
Definition: ratngs.h:379
int dimension() const
Definition: matrix.h:530
void set_rating(float new_val)
Definition: ratngs.h:367
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:364
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void set_certainty(float new_val)
Definition: ratngs.h:370
MATRIX * ratings
Definition: pageres.h:215
#define tprintf(...)
Definition: tprintf.h:31
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
const UNICHARSET * uch_set
Definition: pageres.h:192
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
bool TESS_API NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:151
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
int correct_segmentation_length() const
Definition: blamer.h:126
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:340

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry & vse,
float  features[] 
)
static

Definition at line 1334 of file language_model.cpp.

1335  {
1336  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1337  // Record dictionary match info.
1338  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1339  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1340  if (vse.dawg_info != NULL) {
1341  int permuter = vse.dawg_info->permuter;
1342  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1343  if (vse.consistency_info.num_digits == vse.length) {
1344  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1345  } else {
1346  features[PTRAIN_NUM_SHORT+len] = 1.0;
1347  }
1348  } else if (permuter == DOC_DAWG_PERM) {
1349  features[PTRAIN_DOC_SHORT+len] = 1.0;
1350  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1351  permuter == COMPOUND_PERM) {
1352  features[PTRAIN_DICT_SHORT+len] = 1.0;
1353  } else if (permuter == FREQ_DAWG_PERM) {
1354  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1355  }
1356  }
1357  // Record shape cost feature (normalized by path length).
1358  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1359  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1360  // Record ngram cost. (normalized by the path length).
1361  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1362  if (vse.ngram_info != NULL) {
1363  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1364  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1365  }
1366  // Record consistency-related features.
1367  // Disabled this feature for due to its poor performance.
1368  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1369  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1370  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1371  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
1372  vse.consistency_info.NumInconsistentChartype() : 0.0;
1373  features[PTRAIN_NUM_BAD_SPACING] =
1374  vse.consistency_info.NumInconsistentSpaces();
1375  // Disabled this feature for now due to its poor performance.
1376  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1377 
1378  // Classifier-related features.
1379  features[PTRAIN_RATING_PER_CHAR] =
1380  vse.ratings_sum / static_cast<float>(vse.outline_length);
1381 }

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE * b,
ViterbiStateEntry * parent_vse,
WERD_RES * word_res,
LMConsistencyInfo * consistency_info 
)
protected

Definition at line 1010 of file language_model.cpp.

1016  {
1017  const UNICHARSET &unicharset = dict_->getUnicharset();
1018  UNICHAR_ID unichar_id = b->unichar_id();
1019  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
1020 
1021  // Check punctuation validity.
1022  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1023  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
1024  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
1025  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1026  unicharset.get_isdigit(parent_b->unichar_id()))) {
1027  // reset punc_ref for compound words
1028  consistency_info->punc_ref = NO_EDGE;
1029  } else {
1030  bool is_apos = dict_->is_apostrophe(unichar_id);
1031  bool prev_is_numalpha = (parent_b != NULL &&
1032  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1033  unicharset.get_isdigit(parent_b->unichar_id())));
1034  UNICHAR_ID pattern_unichar_id =
1035  (unicharset.get_isalpha(unichar_id) ||
1036  unicharset.get_isdigit(unichar_id) ||
1037  (is_apos && prev_is_numalpha)) ?
1038  Dawg::kPatternUnicharID : unichar_id;
1039  if (consistency_info->punc_ref == NO_EDGE ||
1040  pattern_unichar_id != Dawg::kPatternUnicharID ||
1041  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1044  consistency_info->punc_ref);
1045  consistency_info->punc_ref =
1046  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1047  node, pattern_unichar_id, word_end) : NO_EDGE;
1048  if (consistency_info->punc_ref == NO_EDGE) {
1049  consistency_info->invalid_punc = true;
1050  }
1051  }
1052  }
1053  }
1054 
1055  // Update case related counters.
1056  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1057  // Reset counters if we are dealing with a compound word.
1058  consistency_info->num_lower = 0;
1059  consistency_info->num_non_first_upper = 0;
1060  }
1061  else if (unicharset.get_islower(unichar_id)) {
1062  consistency_info->num_lower++;
1063  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
1064  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1065  consistency_info->num_lower > 0 ||
1066  consistency_info->num_non_first_upper > 0) {
1067  consistency_info->num_non_first_upper++;
1068  }
1069  }
1070 
1071  // Initialize consistency_info->script_id (use script of unichar_id
1072  // if it is not Common, use script id recorded by the parent otherwise).
1073  // Set inconsistent_script to true if the script of the current unichar
1074  // is not consistent with that of the parent.
1075  consistency_info->script_id = unicharset.get_script(unichar_id);
1076  // Hiragana and Katakana can mix with Han.
1078  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1079  consistency_info->script_id == unicharset.hiragana_sid()) ||
1080  (unicharset.katakana_sid() != unicharset.null_sid() &&
1081  consistency_info->script_id == unicharset.katakana_sid())) {
1082  consistency_info->script_id = dict_->getUnicharset().han_sid();
1083  }
1084  }
1085 
1086  if (parent_vse != NULL &&
1087  (parent_vse->consistency_info.script_id !=
1088  dict_->getUnicharset().common_sid())) {
1089  int parent_script_id = parent_vse->consistency_info.script_id;
1090  // If script_id is Common, use script id of the parent instead.
1091  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1092  consistency_info->script_id = parent_script_id;
1093  }
1094  if (consistency_info->script_id != parent_script_id) {
1095  consistency_info->inconsistent_script = true;
1096  }
1097  }
1098 
1099  // Update chartype related counters.
1100  if (unicharset.get_isalpha(unichar_id)) {
1101  consistency_info->num_alphas++;
1102  } else if (unicharset.get_isdigit(unichar_id)) {
1103  consistency_info->num_digits++;
1104  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1105  consistency_info->num_other++;
1106  }
1107 
1108  // Check font and spacing consistency.
1109  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
1110  int fontinfo_id = -1;
1111  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1112  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1113  fontinfo_id = b->fontinfo_id();
1114  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1115  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1116  fontinfo_id = b->fontinfo_id2();
1117  }
1118  if(language_model_debug_level > 1) {
1119  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1120  (parent_b->fontinfo_id() >= 0) ?
1121  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1122  (parent_b->fontinfo_id2() >= 0) ?
1123  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1124  (b->fontinfo_id() >= 0) ?
1125  fontinfo_table_->get(b->fontinfo_id()).name : "",
1126  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1127  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1128  fontinfo_id);
1129  }
1130  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1131  bool expected_gap_found = false;
1132  float expected_gap;
1133  int temp_gap;
1134  if (fontinfo_id >= 0) { // found a common font
1135  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1136  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1137  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1138  expected_gap = temp_gap;
1139  expected_gap_found = true;
1140  }
1141  } else {
1142  consistency_info->inconsistent_font = true;
1143  // Get an average of the expected gaps in each font
1144  int num_addends = 0;
1145  expected_gap = 0;
1146  int temp_fid;
1147  for (int i = 0; i < 4; ++i) {
1148  if (i == 0) {
1149  temp_fid = parent_b->fontinfo_id();
1150  } else if (i == 1) {
1151  temp_fid = parent_b->fontinfo_id2();
1152  } else if (i == 2) {
1153  temp_fid = b->fontinfo_id();
1154  } else {
1155  temp_fid = b->fontinfo_id2();
1156  }
1157  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1158  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1159  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1160  expected_gap += temp_gap;
1161  num_addends++;
1162  }
1163  }
1164  expected_gap_found = (num_addends > 0);
1165  if (num_addends > 0) {
1166  expected_gap /= static_cast<float>(num_addends);
1167  }
1168  }
1169  if (expected_gap_found) {
1170  float actual_gap =
1171  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1172  float gap_ratio = expected_gap / actual_gap;
1173  // TODO(rays) The gaps seem to be way off most of the time, saved by
1174  // the error here that the ratio was compared to 1/2, when it should
1175  // have been 0.5f. Find the source of the gaps discrepancy and put
1176  // the 0.5f here in place of 0.0f.
1177  // Test on 2476595.sj, pages 0 to 6. (In French.)
1178  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1179  consistency_info->num_inconsistent_spaces++;
1180  }
1181  if (language_model_debug_level > 1) {
1182  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1183  unicharset.id_to_unichar(parent_b->unichar_id()),
1184  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1185  unichar_id, curr_col, expected_gap, actual_gap);
1186  }
1187  }
1188  }
1189  }
1190 }
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:117
inT64 NODE_REF
Definition: dawg.h:55
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int han_sid() const
Definition: unicharset.h:836
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:418
const UnicityTable< FontInfo > * fontinfo_table_
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
int hiragana_sid() const
Definition: unicharset.h:837
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
#define tprintf(...)
Definition: tprintf.h:31
int common_sid() const
Definition: unicharset.h:832
inT16 fontinfo_id() const
Definition: ratngs.h:85
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
GenericVector< int > blob_widths
Definition: pageres.h:205
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
int null_sid() const
Definition: unicharset.h:831
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool empty() const
Definition: genericvector.h:84
#define ASSERT_HOST(x)
Definition: errcode.h:84
int katakana_sid() const
Definition: unicharset.h:838
const Dawg * GetPuncDawg() const
Returns a pointer to the punctuation dawg.
Definition: dict.h:414
int UNICHAR_ID
Definition: unichar.h:33
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE & b,
const ViterbiStateEntry * parent_vse 
)
protected

Definition at line 781 of file language_model.cpp.

785  {
786  // Initialize active_dawgs from parent_vse if it is not NULL.
787  // Otherwise use very_beginning_active_dawgs_.
788  if (parent_vse == NULL) {
791  } else {
792  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
793  dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
794  dawg_args_.permuter = parent_vse->dawg_info->permuter;
795  }
796 
797  // Deal with hyphenated words.
798  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
799  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
800  return new LanguageModelDawgInfo(dawg_args_.active_dawgs,
801  COMPOUND_PERM);
802  }
803 
804  // Deal with compound words.
805  if (dict_->compound_marker(b.unichar_id()) &&
806  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
807  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
808  // Do not allow compound operators at the beginning and end of the word.
809  // Do not allow more than one compound operator per word.
810  // Do not allow compounding of words with lengths shorter than
811  // language_model_min_compound_length
812  if (parent_vse == NULL || word_end ||
814  parent_vse->length < language_model_min_compound_length) return NULL;
815 
816  int i;
817  // Check a that the path terminated before the current character is a word.
818  bool has_word_ending = false;
819  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
820  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
821  const Dawg *pdawg = pos.dawg_index < 0
822  ? NULL : dict_->GetDawg(pos.dawg_index);
823  if (pdawg == NULL || pos.back_to_punc) continue;;
824  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
825  pdawg->end_of_word(pos.dawg_ref)) {
826  has_word_ending = true;
827  break;
828  }
829  }
830  if (!has_word_ending) return NULL;
831 
832  if (language_model_debug_level > 0) tprintf("Compound word found\n");
833  return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
834  } // done dealing with compound words
835 
836  LanguageModelDawgInfo *dawg_info = NULL;
837 
838  // Call LetterIsOkay().
839  // Use the normalized IDs so that all shapes of ' can be allowed in words
840  // like don't.
841  const GenericVector<UNICHAR_ID>& normed_ids =
843  DawgPositionVector tmp_active_dawgs;
844  for (int i = 0; i < normed_ids.size(); ++i) {
846  tprintf("Test Letter OK for unichar %d, normed %d\n",
847  b.unichar_id(), normed_ids[i]);
848  dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
849  word_end && i == normed_ids.size() - 1);
850  if (dawg_args_.permuter == NO_PERM) {
851  break;
852  } else if (i < normed_ids.size() - 1) {
853  tmp_active_dawgs = *dawg_args_.updated_dawgs;
854  dawg_args_.active_dawgs = &tmp_active_dawgs;
855  }
857  tprintf("Letter was OK for unichar %d, normed %d\n",
858  b.unichar_id(), normed_ids[i]);
859  }
860  dawg_args_.active_dawgs = NULL;
861  if (dawg_args_.permuter != NO_PERM) {
862  dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
864  } else if (language_model_debug_level > 3) {
865  tprintf("Letter %s not OK!\n",
867  }
868 
869  return dawg_info;
870 }
DawgPositionVector beginning_active_dawgs_
DawgPositionVector * updated_dawgs
Definition: dict.h:81
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:412
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:357
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
DawgPositionVector very_beginning_active_dawgs_
DawgPositionVector * active_dawgs
Definition: dict.h:80
PermuterType permuter
Definition: dict.h:82
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry * parent_vse 
)
protected

Definition at line 872 of file language_model.cpp.

875  {
876  // Initialize parent context.
877  const char *pcontext_ptr = "";
878  int pcontext_unichar_step_len = 0;
879  if (parent_vse == NULL) {
880  pcontext_ptr = prev_word_str_.string();
881  pcontext_unichar_step_len = prev_word_unichar_step_len_;
882  } else {
883  pcontext_ptr = parent_vse->ngram_info->context.string();
884  pcontext_unichar_step_len =
885  parent_vse->ngram_info->context_unichar_step_len;
886  }
887  // Compute p(unichar | parent context).
888  int unichar_step_len = 0;
889  bool pruned = false;
890  float ngram_cost;
891  float ngram_and_classifier_cost =
892  ComputeNgramCost(unichar, certainty, denom,
893  pcontext_ptr, &unichar_step_len,
894  &pruned, &ngram_cost);
895  // Normalize just the ngram_and_classifier_cost by outline_length.
896  // The ngram_cost is used by the params_model, so it needs to be left as-is,
897  // and the params model cost will be normalized by outline_length.
898  ngram_and_classifier_cost *=
899  outline_length / language_model_ngram_rating_factor;
900  // Add the ngram_cost of the parent.
901  if (parent_vse != NULL) {
902  ngram_and_classifier_cost +=
903  parent_vse->ngram_info->ngram_and_classifier_cost;
904  ngram_cost += parent_vse->ngram_info->ngram_cost;
905  }
906 
907  // Shorten parent context string by unichar_step_len unichars.
908  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
910  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
911  while (num_remove > 0 && *pcontext_ptr != '\0') {
912  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
913  --num_remove;
914  }
915 
916  // Decide whether to prune this ngram path and update changed accordingly.
917  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
918 
919  // Construct and return the new LanguageModelNgramInfo.
920  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
921  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
922  ngram_and_classifier_cost);
923  ngram_info->context += unichar;
924  ngram_info->context_unichar_step_len += unichar_step_len;
925  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
926  return ngram_info;
927 }
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
const char * string() const
Definition: strngs.cpp:201
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry * new_vse,
const ViterbiStateEntry * parent_vse,
LanguageModelState * lms 
)
protected

Definition at line 765 of file language_model.cpp.

767  {
768  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
769  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
770  new_vse->cost >= vit.data()->cost; vit.forward()) {
771  // Clear the appropriate flags if the list already contains
772  // a top choice entry with a lower cost.
773  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
774  }
775  if (language_model_debug_level > 2) {
776  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
777  new_vse->top_choice_flags);
778  }
779 }
#define tprintf(...)
Definition: tprintf.h:31

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE * bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET & unicharset,
WERD_RES * word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType * top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 496 of file language_model.cpp.

500  {
501  for (; !vse_it->cycled_list(); vse_it->forward()) {
502  ViterbiStateEntry* parent_vse = vse_it->data();
503  // Only consider the parent if it has been updated or
504  // if the current ratings cell has just been classified.
505  if (!just_classified && !parent_vse->updated) continue;
507  parent_vse->Print("Considering");
508  // If the parent is non-alnum, then upper counts as lower.
509  *top_choice_flags = blob_choice_flags;
510  if ((blob_choice_flags & kUpperCaseFlag) &&
511  !parent_vse->HasAlnumChoice(unicharset)) {
512  *top_choice_flags |= kLowerCaseFlag;
513  }
514  *top_choice_flags &= parent_vse->top_choice_flags;
515  UNICHAR_ID unichar_id = bc->unichar_id();
516  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
517  UNICHAR_ID parent_id = parent_b->unichar_id();
518  // Digits do not bind to alphas if there is a mix in both parent and current
519  // or if the alpha is not the top choice.
520  if (unicharset.get_isdigit(unichar_id) &&
521  unicharset.get_isalpha(parent_id) &&
522  (mixed_alnum || *top_choice_flags == 0))
523  continue; // Digits don't bind to alphas.
524  // Likewise alphas do not bind to digits if there is a mix in both or if
525  // the digit is not the top choice.
526  if (unicharset.get_isalpha(unichar_id) &&
527  unicharset.get_isdigit(parent_id) &&
528  (mixed_alnum || *top_choice_flags == 0))
529  continue; // Alphas don't bind to digits.
530  // If there is a case mix of the same alpha in the parent list, then
531  // competing_vse is non-null and will be used to determine whether
532  // or not to bind the current blob choice.
533  if (parent_vse->competing_vse != NULL) {
534  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
535  UNICHAR_ID other_id = competing_b->unichar_id();
536  if (language_model_debug_level >= 5) {
537  tprintf("Parent %s has competition %s\n",
538  unicharset.id_to_unichar(parent_id),
539  unicharset.id_to_unichar(other_id));
540  }
541  if (unicharset.SizesDistinct(parent_id, other_id)) {
542  // If other_id matches bc wrt position and size, and parent_id, doesn't,
543  // don't bind to the current parent.
544  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
546  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
548  continue; // Competing blobchoice has a better vertical match.
549  }
550  }
551  vse_it->forward();
552  return parent_vse; // This one is good!
553  }
554  return NULL; // Ran out of possibilities.
555 }
static const LanguageModelFlagsType kUpperCaseFlag
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
float x_height
Definition: pageres.h:295
static const LanguageModelFlagsType kLowerCaseFlag
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
#define tprintf(...)
Definition: tprintf.h:31
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
int UNICHAR_ID
Definition: unichar.h:33

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 100 of file language_model.h.

100 { return params_model_; }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 379 of file language_model.cpp.

382  {
383  BLOB_CHOICE_IT c_it(curr_list);
384  const UNICHARSET &unicharset = dict_->getUnicharset();
385  BLOB_CHOICE *first_unichar = NULL;
386  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
387  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
388  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
389  if (first_unichar == NULL) first_unichar = c_it.data();
390  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
391  *first_lower = c_it.data();
392  }
393  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
394  !unicharset.get_islower(unichar_id)) {
395  *first_upper = c_it.data();
396  }
397  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
398  *first_digit = c_it.data();
399  }
400  }
401  ASSERT_HOST(first_unichar != NULL);
402  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
403  *first_digit != NULL;
404  if (*first_lower == NULL) *first_lower = first_unichar;
405  if (*first_upper == NULL) *first_upper = first_unichar;
406  if (*first_digit == NULL) *first_digit = first_unichar;
407  return mixed;
408 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
Definition: cluster.h:45
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
#define ASSERT_HOST(x)
Definition: errcode.h:84
int UNICHAR_ID
Definition: unichar.h:33

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE * prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 132 of file language_model.cpp.

134  {
135  fixed_pitch_ = fixed_pitch;
136  max_char_wh_ratio_ = max_char_wh_ratio;
137  rating_cert_scale_ = rating_cert_scale;
138  acceptable_choice_found_ = false;
140 
141  // Initialize vectors with beginning DawgInfos.
146 
147  // Fill prev_word_str_ with the last language_model_ngram_order
148  // unichars from prev_word.
150  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
151  prev_word_str_ = prev_word->unichar_string();
153  } else {
154  prev_word_str_ = " ";
155  }
156  const char *str_ptr = prev_word_str_.string();
157  const char *str_end = str_ptr + prev_word_str_.length();
158  int step;
160  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
161  str_ptr += step;
163  }
164  ASSERT_HOST(str_ptr == str_end);
165  }
166 }
DawgPositionVector beginning_active_dawgs_
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:565
const STRING & unichar_string() const
Definition: ratngs.h:525
inT32 length() const
Definition: strngs.cpp:196
bool language_model_ngram_space_delimited_language
const char * string() const
Definition: strngs.cpp:201
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:548
DawgPositionVector very_beginning_active_dawgs_
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry & vse)
inline protected

Definition at line 291 of file language_model.h.

291  {
292  if (vse.top_choice_flags) return false;
293  if (vse.dawg_info != NULL &&
294  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
295  vse.dawg_info->permuter == USER_DAWG_PERM ||
296  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
297  return true;
298  }

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 96 of file language_model.h.

96  {
98  }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState * parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 419 of file language_model.cpp.

420  {
421  if (parent_node == NULL) return -1;
422  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
423  ViterbiStateEntry* top_lower = NULL;
424  ViterbiStateEntry* top_upper = NULL;
425  ViterbiStateEntry* top_digit = NULL;
426  ViterbiStateEntry* top_choice = NULL;
427  float lower_rating = 0.0f;
428  float upper_rating = 0.0f;
429  float digit_rating = 0.0f;
430  float top_rating = 0.0f;
431  const UNICHARSET &unicharset = dict_->getUnicharset();
432  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
433  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
434  ViterbiStateEntry* vse = vit.data();
435  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
436  // back to the real character if needed.
437  ViterbiStateEntry* unichar_vse = vse;
438  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
439  float rating = unichar_vse->curr_b->rating();
440  while (unichar_id == INVALID_UNICHAR_ID &&
441  unichar_vse->parent_vse != NULL) {
442  unichar_vse = unichar_vse->parent_vse;
443  unichar_id = unichar_vse->curr_b->unichar_id();
444  rating = unichar_vse->curr_b->rating();
445  }
446  if (unichar_id != INVALID_UNICHAR_ID) {
447  if (unicharset.get_islower(unichar_id)) {
448  if (top_lower == NULL || lower_rating > rating) {
449  top_lower = vse;
450  lower_rating = rating;
451  }
452  } else if (unicharset.get_isalpha(unichar_id)) {
453  if (top_upper == NULL || upper_rating > rating) {
454  top_upper = vse;
455  upper_rating = rating;
456  }
457  } else if (unicharset.get_isdigit(unichar_id)) {
458  if (top_digit == NULL || digit_rating > rating) {
459  top_digit = vse;
460  digit_rating = rating;
461  }
462  }
463  }
464  if (top_choice == NULL || top_rating > rating) {
465  top_choice = vse;
466  top_rating = rating;
467  top_id = unichar_id;
468  }
469  }
470  if (top_choice == NULL) return -1;
471  bool mixed = (top_lower != NULL || top_upper != NULL) &&
472  top_digit != NULL;
473  if (top_lower == NULL) top_lower = top_choice;
474  top_lower->top_choice_flags |= kLowerCaseFlag;
475  if (top_upper == NULL) top_upper = top_choice;
476  top_upper->top_choice_flags |= kUpperCaseFlag;
477  if (top_digit == NULL) top_digit = top_choice;
478  top_digit->top_choice_flags |= kDigitFlag;
479  top_choice->top_choice_flags |= kSmallestRatingFlag;
480  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
481  (top_choice->top_choice_flags &
483  // If the compound marker top choice carries any of the top alnum flags,
484  // then give it all of them, allowing words like I-295 to be chosen.
485  top_choice->top_choice_flags |=
487  }
488  return mixed ? 1 : 0;
489 }
static const LanguageModelFlagsType kUpperCaseFlag
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kDigitFlag
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
static const LanguageModelFlagsType kLowerCaseFlag
Definition: cluster.h:45
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
int UNICHAR_ID
Definition: unichar.h:33
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry vse,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1234 of file language_model.cpp.

1239  {
1240  bool truth_path;
1241  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1242  blamer_bundle, &truth_path);
1243  ASSERT_HOST(word != NULL);
1244  if (dict_->stopper_debug_level >= 1) {
1245  STRING word_str;
1246  word->string_and_lengths(&word_str, NULL);
1247  vse->Print(word_str.string());
1248  }
1249  if (language_model_debug_level > 0) {
1250  word->print("UpdateBestChoice() constructed word");
1251  }
1252  // Record features from the current path if necessary.
1253  ParamsTrainingHypothesis curr_hyp;
1254  if (blamer_bundle != NULL) {
1255  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
1256  static_cast<PermuterType>(word->permuter());
1257  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1258  word->string_and_lengths(&(curr_hyp.str), NULL);
1259  curr_hyp.cost = vse->cost; // record cost for error rate computations
1260  if (language_model_debug_level > 0) {
1261  tprintf("Raw features extracted from %s (cost=%g) [ ",
1262  curr_hyp.str.string(), curr_hyp.cost);
1263  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1264  tprintf("%g ", curr_hyp.features[deb_i]);
1265  }
1266  tprintf("]\n");
1267  }
1268  // Record the current hypothesis in params_training_bundle.
1269  blamer_bundle->AddHypothesis(curr_hyp);
1270  if (truth_path)
1271  blamer_bundle->UpdateBestRating(word->rating());
1272  }
1273  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
1274  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1275  // we no longer need it.
1276  delete word;
1277  return;
1278  }
1279  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
1280  word->SetScriptPositions(false, word_res->chopped_word);
1281  // Update and log new raw_choice if needed.
1282  if (word_res->raw_choice == NULL ||
1283  word->rating() < word_res->raw_choice->rating()) {
1284  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1285  tprintf("Updated raw choice\n");
1286  }
1287  // Set the modified rating for best choice to vse->cost and log best choice.
1288  word->set_rating(vse->cost);
1289  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1290  // computes adjust_factor that is used by the adaption code (e.g. by
1291  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1292  // Note: the rating of the word is not adjusted.
1293  dict_->adjust_word(word, vse->dawg_info == NULL,
1294  vse->consistency_info.xht_decision, 0.0,
1295  false, language_model_debug_level > 0);
1296  // Hand ownership of the word over to the word_res.
1297  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
1298  dict_->stopper_debug_level >= 1, word)) {
1299  // The word was so bad that it was deleted.
1300  return;
1301  }
1302  if (word_res->best_choice == word) {
1303  // Word was the new best.
1304  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1305  AcceptablePath(*vse)) {
1306  acceptable_choice_found_ = true;
1307  }
1308  // Update best_choice_bundle.
1309  best_choice_bundle->updated = true;
1310  best_choice_bundle->best_vse = vse;
1311  if (language_model_debug_level > 0) {
1312  tprintf("Updated best choice\n");
1313  word->print_state("New state ");
1314  }
1315  // Update hyphen state if we are dealing with a dictionary word.
1316  if (vse->dawg_info != NULL) {
1317  if (dict_->has_hyphen_end(*word)) {
1318  dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
1319  } else {
1320  dict_->reset_hyphen_vars(true);
1321  }
1322  }
1323 
1324  if (blamer_bundle != NULL) {
1325  blamer_bundle->set_best_choice_is_dict_and_top_choice(
1326  vse->dawg_info != NULL && vse->top_choice_flags);
1327  }
1328  }
1329  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
1330  word->DisplaySegmentation(word_res->chopped_word);
1331  }
1332 }
bool AcceptablePath(const ViterbiStateEntry &vse)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void set_rating(float new_val)
Definition: ratngs.h:367
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void print() const
Definition: ratngs.h:564
PermuterType
Definition: ratngs.h:240
int stopper_debug_level
Definition: dict.h:620
void UpdateBestRating(float rating)
Definition: blamer.h:122
uinT8 permuter() const
Definition: ratngs.h:344
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
void print_state(const char *msg) const
Definition: ratngs.cpp:738
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
const char * string() const
Definition: strngs.cpp:201
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:747
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
int tessedit_truncate_wordchoice_log
Definition: dict.h:626
TWERD * chopped_word
Definition: pageres.h:201
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:50
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:650
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
float rating() const
Definition: ratngs.h:325
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528
DawgPositionVector * active_dawgs
Definition: dict.h:80
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
bool empty() const
Definition: genericvector.h:84
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState parent_node,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 249 of file language_model.cpp.

257  {
258  if (language_model_debug_level > 0) {
259  tprintf("\nUpdateState: col=%d row=%d %s",
260  curr_col, curr_row, just_classified ? "just_classified" : "");
261  if (language_model_debug_level > 5)
262  tprintf("(parent=%p)\n", parent_node);
263  else
264  tprintf("\n");
265  }
266  // Initialize helper variables.
267  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
268  bool new_changed = false;
269  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
270  const UNICHARSET& unicharset = dict_->getUnicharset();
271  BLOB_CHOICE *first_lower = NULL;
272  BLOB_CHOICE *first_upper = NULL;
273  BLOB_CHOICE *first_digit = NULL;
274  bool has_alnum_mix = false;
275  if (parent_node != NULL) {
276  int result = SetTopParentLowerUpperDigit(parent_node);
277  if (result < 0) {
278  if (language_model_debug_level > 0)
279  tprintf("No parents found to process\n");
280  return false;
281  }
282  if (result > 0)
283  has_alnum_mix = true;
284  }
285  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
286  &first_digit))
287  has_alnum_mix = false;;
288  ScanParentsForCaseMix(unicharset, parent_node);
289  if (language_model_debug_level > 3 && parent_node != NULL) {
290  parent_node->Print("Parent viterbi list");
291  }
292  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
293 
294  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
295  ViterbiStateEntry_IT vit;
296  BLOB_CHOICE_IT c_it(curr_list);
297  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
298  BLOB_CHOICE* choice = c_it.data();
299  // TODO(antonova): make sure commenting this out if ok for ngram
300  // model scoring (I think this was introduced to fix ngram model quirks).
301  // Skip NULL unichars unless it is the only choice.
302  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
303  UNICHAR_ID unichar_id = choice->unichar_id();
304  if (unicharset.get_fragment(unichar_id)) {
305  continue; // Skip fragments.
306  }
307  // Set top choice flags.
308  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
309  if (c_it.at_first() || !new_changed)
310  blob_choice_flags |= kSmallestRatingFlag;
311  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
312  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
313  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
314 
315  if (parent_node == NULL) {
316  // Process the beginning of a word.
317  // If there is a better case variant that is not distinguished by size,
318  // skip this blob choice, as we have no choice but to accept the result
319  // of the character classifier to distinguish between them, even if
320  // followed by an upper case.
321  // With words like iPoc, and other CamelBackWords, the lower-upper
322  // transition can only be achieved if the classifier has the correct case
323  // as the top choice, and leaving an initial I lower down the list
324  // increases the chances of choosing IPoc simply because it doesn't
325  // include such a transition. iPoc will beat iPOC and ipoc because
326  // the other words are baseline/x-height inconsistent.
327  if (HasBetterCaseVariant(unicharset, choice, curr_list))
328  continue;
329  // Upper counts as lower at the beginning of a word.
330  if (blob_choice_flags & kUpperCaseFlag)
331  blob_choice_flags |= kLowerCaseFlag;
332  new_changed |= AddViterbiStateEntry(
333  blob_choice_flags, denom, word_end, curr_col, curr_row,
334  choice, curr_state, NULL, pain_points,
335  word_res, best_choice_bundle, blamer_bundle);
336  } else {
337  // Get viterbi entries from each parent ViterbiStateEntry.
338  vit.set_to_list(&parent_node->viterbi_state_entries);
339  int vit_counter = 0;
340  vit.mark_cycle_pt();
341  ViterbiStateEntry* parent_vse = NULL;
342  LanguageModelFlagsType top_choice_flags;
343  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
344  c_it.data(), blob_choice_flags,
345  unicharset, word_res, &vit,
346  &top_choice_flags)) != NULL) {
347  // Skip pruned entries and do not look at prunable entries if already
348  // examined language_model_viterbi_list_max_num_prunable of those.
349  if (PrunablePath(*parent_vse) &&
350  (++vit_counter > language_model_viterbi_list_max_num_prunable ||
351  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
352  continue;
353  }
354  // If the parent has no alnum choice, (ie choice is the first in a
355  // string of alnum), and there is a better case variant that is not
356  // distinguished by size, skip this blob choice/parent, as with the
357  // initial blob treatment above.
358  if (!parent_vse->HasAlnumChoice(unicharset) &&
359  HasBetterCaseVariant(unicharset, choice, curr_list))
360  continue;
361  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
362  // looks good according to the Dawgs or character ngram model.
363  new_changed |= AddViterbiStateEntry(
364  top_choice_flags, denom, word_end, curr_col, curr_row,
365  c_it.data(), curr_state, parent_vse, pain_points,
366  word_res, best_choice_bundle, blamer_bundle);
367  }
368  }
369  }
370  return new_changed;
371 }
static const LanguageModelFlagsType kUpperCaseFlag
int dimension() const
Definition: matrix.h:530
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kSmallestRatingFlag
static const LanguageModelFlagsType kDigitFlag
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int language_model_viterbi_list_max_num_prunable
static const LanguageModelFlagsType kLowerCaseFlag
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
MATRIX * ratings
Definition: pageres.h:215
static const LanguageModelFlagsType kXhtConsistentFlag
#define tprintf(...)
Definition: tprintf.h:31
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int UNICHAR_ID
Definition: unichar.h:33
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 408 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 396 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 410 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 356 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 375 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 382 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 371 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 48 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 46 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 53 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 45 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 47 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 49 of file language_model.h.

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 308 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 335 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 322 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 310 of file language_model.h.

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 312 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "

Definition at line 331 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 328 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 320 of file language_model.h.

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 333 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 325 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 344 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 348 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 350 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 353 of file language_model.h.

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 340 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 338 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 342 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 346 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 352 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 356 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 317 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 385 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 413 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 392 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 393 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 366 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 395 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 354 of file language_model.h.


The documentation for this class was generated from the following files: