tesseract: tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions
	LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)

	~LanguageModel ()

void	InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)

bool	UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

bool	AcceptableChoiceFound ()

void	SetAcceptableChoiceFound (bool val)

ParamsModel &	getParamsModel ()

Static Public Member Functions
static void	ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])

Public Attributes
int	language_model_debug_level = 0

bool	language_model_ngram_on = false

int	language_model_ngram_order = 8

int	language_model_viterbi_list_max_num_prunable = 10

int	language_model_viterbi_list_max_size = 500

double	language_model_ngram_small_prob = 0.000001

double	language_model_ngram_nonmatch_score = -40.0

bool	language_model_ngram_use_only_first_uft8_step = false

double	language_model_ngram_scale_factor = 0.03

double	language_model_ngram_rating_factor = 16.0

bool	language_model_ngram_space_delimited_language = true

int	language_model_min_compound_length = 3

double	language_model_penalty_non_freq_dict_word = 0.1

double	language_model_penalty_non_dict_word = 0.15

double	language_model_penalty_punc = 0.2

double	language_model_penalty_case = 0.1

double	language_model_penalty_script = 0.5

double	language_model_penalty_chartype = 0.3

double	language_model_penalty_font = 0.00

double	language_model_penalty_spacing = 0.05

double	language_model_penalty_increment = 0.01

int	wordrec_display_segmentations = 0

bool	language_model_use_sigmoidal_certainty = false

Static Public Attributes
static const LanguageModelFlagsType	kSmallestRatingFlag = 0x1

static const LanguageModelFlagsType	kLowerCaseFlag = 0x2

static const LanguageModelFlagsType	kUpperCaseFlag = 0x4

static const LanguageModelFlagsType	kDigitFlag = 0x8

static const LanguageModelFlagsType	kXhtConsistentFlag = 0x10

static const float	kMaxAvgNgramCost = 25.0f

Protected Member Functions
float	CertaintyScore (float cert)

float	ComputeAdjustment (int num_problems, float penalty)

float	ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)

float	ComputeAdjustedPathCost (ViterbiStateEntry *vse)

bool	GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const

int	SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const

ViterbiStateEntry *	GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const

bool	AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

void	GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)

LanguageModelDawgInfo *	GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)

LanguageModelNgramInfo *	GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)

float	ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)

float	ComputeDenom (BLOB_CHOICE_LIST *curr_list)

void	FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)

void	UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

WERD_CHOICE *	ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)

void	ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)

bool	PrunablePath (const ViterbiStateEntry &vse)

bool	AcceptablePath (const ViterbiStateEntry &vse)

Protected Attributes
DawgArgs	dawg_args_

float	rating_cert_scale_

const UnicityTable< FontInfo > *	fontinfo_table_

Dict *	dict_

bool	fixed_pitch_

float	max_char_wh_ratio_

STRING	prev_word_str_

int	prev_word_unichar_step_len_

DawgPositionVector	very_beginning_active_dawgs_

DawgPositionVector	beginning_active_dawgs_

bool	acceptable_choice_found_

bool	correct_segmentation_explored_

ParamsModel	params_model_

Detailed Description

Definition at line 42 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel	(	const UnicityTable< FontInfo > *	fontinfo_table,
		Dict *	dict
	)

Definition at line 44 of file language_model.cpp.

   // Constructor initializer list and body.
   // Every tunable language_model_* / wordrec_* parameter is registered through
   // the *_MEMBER macros with its default value, a help string, and the
   // parameter collection obtained from dict->getCCUtil()->params().
   // NOTE(review): dict is dereferenced in this initializer list before the
   // ASSERT_HOST(dict_ != NULL) in the body runs, so a NULL dict would already
   // have been dereferenced by then -- confirm callers never pass NULL.
   : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
                dict->getCCUtil()->params()),
     BOOL_INIT_MEMBER(language_model_ngram_on, false,
                      "Turn on/off the use of character ngram model",
                      dict->getCCUtil()->params()),
     INT_MEMBER(language_model_ngram_order, 8,
                "Maximum order of the character ngram model",
                dict->getCCUtil()->params()),
     INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
                "Maximum number of prunable (those for which"
                " PrunablePath() is true) entries in each viterbi list"
                " recorded in BLOB_CHOICEs",
                dict->getCCUtil()->params()),
     INT_MEMBER(language_model_viterbi_list_max_size, 500,
                "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
                dict->getCCUtil()->params()),
     double_MEMBER(language_model_ngram_small_prob, 0.000001,
                   "To avoid overly small denominators use this as the "
                   "floor of the probability returned by the ngram model.",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                   "Average classifier score of a non-matching unichar.",
                   dict->getCCUtil()->params()),
     BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                 "Use only the first UTF8 step of the given string"
                 " when computing log probabilities.",
                 dict->getCCUtil()->params()),
     double_MEMBER(language_model_ngram_scale_factor, 0.03,
                   "Strength of the character ngram model relative to the"
                   " character classifier ",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_ngram_rating_factor, 16.0,
                   "Factor to bring log-probs into the same range as ratings"
                   " when multiplied by outline length ",
                   dict->getCCUtil()->params()),
     BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                 "Words are delimited by space",
                 dict->getCCUtil()->params()),
     INT_MEMBER(language_model_min_compound_length, 3,
                "Minimum length of compound words",
                dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                   "Penalty for words not in the frequent word dictionary",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                   "Penalty for non-dictionary words",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_punc, 0.2,
                   "Penalty for inconsistent punctuation",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_case, 0.1,
                   "Penalty for inconsistent case",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_script, 0.5,
                   "Penalty for inconsistent script",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_chartype, 0.3,
                   "Penalty for inconsistent character type",
                   dict->getCCUtil()->params()),
     // TODO(daria, rays): enable font consistency checking
     // after improving font analysis.
     double_MEMBER(language_model_penalty_font, 0.00,
                   "Penalty for inconsistent font",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_spacing, 0.05,
                   "Penalty for inconsistent spacing",
                   dict->getCCUtil()->params()),
     double_MEMBER(language_model_penalty_increment, 0.01,
                   "Penalty increment",
                   dict->getCCUtil()->params()),
     INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
                dict->getCCUtil()->params()),
     BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                      "Use sigmoidal score for certainty",
                      dict->getCCUtil()->params()),
   // The DawgPositionVector allocated here is presumably stored as
   // dawg_args_.updated_dawgs, which the destructor deletes -- confirm
   // against the DawgArgs constructor.
   dawg_args_(NULL, new DawgPositionVector(), NO_PERM),
   fontinfo_table_(fontinfo_table), dict_(dict),
   fixed_pitch_(false), max_char_wh_ratio_(0.0),
   acceptable_choice_found_(false) {
   ASSERT_HOST(dict_ != NULL);
 }

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 128 of file language_model.cpp.

                               {
   // Release the heap-allocated vector held in dawg_args_.updated_dawgs
   // (the constructor passes a `new DawgPositionVector()` to dawg_args_).
   delete dawg_args_.updated_dawgs;
 }

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )

inline

Definition at line 95 of file language_model.h.

{ return acceptable_choice_found_; }

tesseract::LanguageModel::acceptable_choice_found_

bool acceptable_choice_found_

Definition: language_model.h:408

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry & vse )

inline protected

Definition at line 301 of file language_model.h.

                                                            {
     return (vse.dawg_info != NULL || vse.Consistent() ||
             (vse.ngram_info != NULL && !vse.ngram_info->pruned));
   }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry	(	LanguageModelFlagsType	top_choice_flags,
		float	denom,
		bool	word_end,
		int	curr_col,
		int	curr_row,
		BLOB_CHOICE *	b,
		LanguageModelState *	curr_state,
		ViterbiStateEntry *	parent_vse,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

protected

Definition at line 557 of file language_model.cpp.

                                  {
   // Builds a candidate ViterbiStateEntry for blob choice b that extends
   // parent_vse, scores it with the enabled language model components, and
   // inserts it into curr_state->viterbi_state_entries unless one of the
   // pruning checks below rejects it.
   // Returns true if the entry was added to the list, false if it was
   // discarded (every discard path deletes whatever was allocated so far).
   ViterbiStateEntry_IT vit;
   if (language_model_debug_level > 1) {
     tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
             " certainty=%.4f top_choice_flags=0x%x",
             dict_->getUnicharset().id_to_unichar(b->unichar_id()),
             b->rating(), b->certainty(), top_choice_flags);
     if (language_model_debug_level > 5)
       tprintf(" parent_vse=%p\n", parent_vse);
     else
       tprintf("\n");
   }
   // Check whether the list is full.
   // NOTE(review): curr_state is NULL-checked here but dereferenced
   // unconditionally further down; presumably callers never pass NULL.
   if (curr_state != NULL &&
       curr_state->viterbi_state_entries_length >=
           language_model_viterbi_list_max_size) {
     if (language_model_debug_level > 1) {
       tprintf("AddViterbiStateEntry: viterbi list is full!\n");
     }
     return false;
   }
 
   // Invoke Dawg language model component.
   LanguageModelDawgInfo *dawg_info =
     GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
 
   float outline_length =
       AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
   // Invoke Ngram language model component.
   LanguageModelNgramInfo *ngram_info = NULL;
   if (language_model_ngram_on) {
     ngram_info = GenerateNgramInfo(
         dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
         denom, curr_col, curr_row, outline_length, parent_vse);
     ASSERT_HOST(ngram_info != NULL);
   }
   bool liked_by_language_model = dawg_info != NULL ||
       (ngram_info != NULL && !ngram_info->pruned);
   // Quick escape if not liked by the language model, can't be consistent
   // xheight, and not top choice.
   if (!liked_by_language_model && top_choice_flags == 0) {
     if (language_model_debug_level > 1) {
       tprintf("Language model components very early pruned this entry\n");
     }
     // No ViterbiStateEntry exists yet, so the component infos are freed here.
     delete ngram_info;
     delete dawg_info;
     return false;
   }
 
   // Check consistency of the path and set the relevant consistency_info.
   LMConsistencyInfo consistency_info(
     parent_vse != NULL ? &parent_vse->consistency_info : NULL);
   // Start with just the x-height consistency, as it provides significant
   // pruning opportunity.
   consistency_info.ComputeXheightConsistency(
       b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
   // Turn off xheight consistent flag if not consistent.
   if (consistency_info.InconsistentXHeight()) {
     top_choice_flags &= ~kXhtConsistentFlag;
   }
 
   // Quick escape if not liked by the language model, not consistent xheight,
   // and not top choice.
   if (!liked_by_language_model && top_choice_flags == 0) {
     if (language_model_debug_level > 1) {
       tprintf("Language model components early pruned this entry\n");
     }
     delete ngram_info;
     delete dawg_info;
     return false;
   }
 
   // Compute the rest of the consistency info.
   FillConsistencyInfo(curr_col, word_end, b, parent_vse,
                       word_res, &consistency_info);
   if (dawg_info != NULL && consistency_info.invalid_punc) {
     consistency_info.invalid_punc = false;  // do not penalize dict words
   }
 
   // Compute cost of associating the blobs that represent the current unichar.
   AssociateStats associate_stats;
   ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                         parent_vse, word_res, &associate_stats);
   // Shape cost and the bad-shape flag accumulate along the path.
   if (parent_vse != NULL) {
     associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
     associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
   }
 
   // Create the new ViterbiStateEntry and compute the adjusted cost of the
   // path. The debug string is only passed when debugging is enabled.
   // NOTE(review): from here on only new_vse is deleted on the discard paths;
   // presumably ViterbiStateEntry takes ownership of dawg_info and ngram_info
   // -- confirm against its destructor.
   ViterbiStateEntry *new_vse = new ViterbiStateEntry(
       parent_vse, b, 0.0, outline_length,
       consistency_info, associate_stats, top_choice_flags, dawg_info,
       ngram_info, (language_model_debug_level > 0) ?
           dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
   new_vse->cost = ComputeAdjustedPathCost(new_vse);
   if (language_model_debug_level >= 3)
     tprintf("Adjusted cost = %g\n", new_vse->cost);
 
   // Invoke Top Choice language model component to make the final adjustments
   // to new_vse->top_choice_flags.
   if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
     GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
   }
 
   // If language model components did not like this unichar - return.
   bool keep = new_vse->top_choice_flags || liked_by_language_model;
   if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
       consistency_info.inconsistent_script) {       // with inconsistent script
     keep = false;
   }
   if (!keep) {
     if (language_model_debug_level > 1) {
       tprintf("Language model components did not like this entry\n");
     }
     delete new_vse;
     return false;
   }
 
   // Discard this entry if it represents a prunable path and
   // language_model_viterbi_list_max_num_prunable such entries with a lower
   // cost have already been recorded.
   if (PrunablePath(*new_vse) &&
       (curr_state->viterbi_state_entries_prunable_length >=
        language_model_viterbi_list_max_num_prunable) &&
       new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
     if (language_model_debug_level > 1) {
       tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
               new_vse->cost,
               curr_state->viterbi_state_entries_prunable_max_cost);
     }
     delete new_vse;
     return false;
   }
 
   // Update best choice if needed.
   if (word_end) {
     UpdateBestChoice(new_vse, pain_points, word_res,
                      best_choice_bundle, blamer_bundle);
     // Discard the entry if UpdateBestChoice() found flaws in it.
     // It is kept if it is the bundle's current best_vse, since the bundle
     // still points at it.
     if (new_vse->cost >= WERD_CHOICE::kBadRating &&
         new_vse != best_choice_bundle->best_vse) {
       if (language_model_debug_level > 1) {
         tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
       }
       delete new_vse;
       return false;
     }
   }
 
   // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries
   // (inserted in the order defined by ViterbiStateEntry::Compare).
   curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                                false, new_vse);
   curr_state->viterbi_state_entries_length++;
   if (PrunablePath(*new_vse)) {
     curr_state->viterbi_state_entries_prunable_length++;
   }
 
   // Update curr_state->viterbi_state_entries_prunable_max_cost and clear
   // the top_choice_flags of entries with a cost higher than new_vse->cost.
   if ((curr_state->viterbi_state_entries_prunable_length >=
        language_model_viterbi_list_max_num_prunable) ||
       new_vse->top_choice_flags) {
     ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
     int prunable_counter = language_model_viterbi_list_max_num_prunable;
     vit.set_to_list(&(curr_state->viterbi_state_entries));
     for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
       ViterbiStateEntry *curr_vse = vit.data();
       // Clear the appropriate top choice flags of the entries in the
       // list that have cost higher than new_vse->cost
       // (since they will not be top choices any more).
       if (curr_vse->top_choice_flags && curr_vse != new_vse &&
           curr_vse->cost > new_vse->cost) {
         curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
       }
       // Count down prunable entries; the entry at which the counter reaches
       // zero supplies the new max cost.
       if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
       // Update curr_state->viterbi_state_entries_prunable_max_cost.
       if (prunable_counter == 0) {
         curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
         if (language_model_debug_level > 1) {
           tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                   curr_state->viterbi_state_entries_prunable_max_cost);
         }
         prunable_counter = -1;  // stop counting
       }
     }
   }
 
   // Print the newly created ViterbiStateEntry.
   if (language_model_debug_level > 2) {
     new_vse->Print("New");
     if (language_model_debug_level > 5)
       curr_state->Print("Updated viterbi list");
   }
 
   return true;
 }

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float cert )

inline protected

Definition at line 104 of file language_model.h.

                                           {
     if (language_model_use_sigmoidal_certainty) {
       // cert is assumed to be between 0 and -dict_->certainty_scale.
       // If you enable language_model_use_sigmoidal_certainty, you
       // need to adjust language_model_ngram_nonmatch_score as well.
       cert = -cert / dict_->certainty_scale;
       return 1.0f / (1.0f + exp(10.0f * cert));
     } else {
       return (-1.0f / cert);
     }
   }

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry * vse )

protected

Definition at line 1192 of file language_model.cpp.

                                                                    {
   ASSERT_HOST(vse != NULL);
   if (params_model_.Initialized()) {
     float features[PTRAIN_NUM_FEATURE_TYPES];
     ExtractFeaturesFromPath(*vse, features);
     float cost = params_model_.ComputeCost(features);
     if (language_model_debug_level > 3) {
       tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
       if (language_model_debug_level >= 5) {
         for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
           tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
         }
       }
     }
     return cost * vse->outline_length;
   } else {
     float adjustment = 1.0f;
     if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
       adjustment += language_model_penalty_non_freq_dict_word;
     }
     if (vse->dawg_info == NULL) {
       adjustment += language_model_penalty_non_dict_word;
       if (vse->length > language_model_min_compound_length) {
         adjustment += ((vse->length - language_model_min_compound_length) *
             language_model_penalty_increment);
       }
     }
     if (vse->associate_stats.shape_cost > 0) {
       adjustment += vse->associate_stats.shape_cost /
           static_cast<float>(vse->length);
     }
     if (language_model_ngram_on) {
       ASSERT_HOST(vse->ngram_info != NULL);
       return vse->ngram_info->ngram_and_classifier_cost * adjustment;
     } else {
       adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                                  vse->consistency_info);
       return vse->ratings_sum * adjustment;
     }
   }
 }

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment	(	int	num_problems,
		float	penalty
	)

inline protected

Definition at line 116 of file language_model.h.

                                                                   {
     if (num_problems == 0) return 0.0f;
     if (num_problems == 1) return penalty;
     return (penalty + (language_model_penalty_increment *
                        static_cast<float>(num_problems-1)));
   }

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats	(	int	col,
		int	row,
		float	max_char_wh_ratio,
		ViterbiStateEntry *	parent_vse,
		WERD_RES *	word_res,
		AssociateStats *	associate_stats
	)

inline protected

Definition at line 272 of file language_model.h.

                                                                      {
     AssociateUtils::ComputeStats(
         col, row,
         (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
         (parent_vse != NULL) ? parent_vse->length : 0,
         fixed_pitch_, max_char_wh_ratio,
         word_res, language_model_debug_level > 2, associate_stats);
   }

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment	(	const LanguageModelDawgInfo *	dawg_info,
		const LMConsistencyInfo &	consistency_info
	)

inline protected

Definition at line 127 of file language_model.h.

                                                  {
     if (dawg_info != NULL) {
       return ComputeAdjustment(consistency_info.NumInconsistentCase(),
                                language_model_penalty_case) +
           (consistency_info.inconsistent_script ?
              language_model_penalty_script : 0.0f);
     }
     return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
                               language_model_penalty_punc) +
             ComputeAdjustment(consistency_info.NumInconsistentCase(),
                               language_model_penalty_case) +
             ComputeAdjustment(consistency_info.NumInconsistentChartype(),
                               language_model_penalty_chartype) +
             ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
                               language_model_penalty_spacing) +
             (consistency_info.inconsistent_script ?
              language_model_penalty_script : 0.0f) +
             (consistency_info.inconsistent_font ?
              language_model_penalty_font : 0.0f));
   }

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST * curr_list )

protected

Definition at line 989 of file language_model.cpp.

                                                              {
   if (curr_list->empty()) return 1.0f;
   float denom = 0.0f;
   int len = 0;
   BLOB_CHOICE_IT c_it(curr_list);
   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     ASSERT_HOST(c_it.data() != NULL);
     ++len;
     denom += CertaintyScore(c_it.data()->certainty());
   }
   assert(len != 0);
   // The ideal situation would be to have the classifier scores for
   // classifying each position as each of the characters in the unicharset.
   // Since we can not do this because of speed, we add a very crude estimate
   // of what these scores for the "missing" classifications would sum up to.
   denom += (dict_->getUnicharset().size() - len) *
     CertaintyScore(language_model_ngram_nonmatch_score);
 
   return denom;
 }

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost	(	const char *	unichar,
		float	certainty,
		float	denom,
		const char *	context,
		int *	unichar_step_len,
		bool *	found_small_prob,
		float *	ngram_prob
	)

protected

Definition at line 929 of file language_model.cpp.

                                                          {
   // Accumulates dict_->ProbabilityInContext() over the UTF-8 steps of
   // unichar, averages it over *unichar_step_len, floors the result at
   // language_model_ngram_small_prob (setting *found_small_prob when the floor
   // is applied), stores -log2(prob) through ngram_cost, and returns the
   // combined cost
   //   -log2(CertaintyScore(certainty) / denom) +
   //       *ngram_cost * language_model_ngram_scale_factor.
   // NOTE(review): this body writes through `ngram_cost`, whereas the
   // documented signature names the last parameter `ngram_prob` -- presumably
   // the definition uses a different parameter name than the declaration;
   // confirm.
   // NOTE(review): *unichar_step_len is only incremented and
   // *found_small_prob is only ever set to true here; presumably the caller
   // initializes both -- confirm. If *unichar_step_len is still 0 at the
   // normalization step (e.g. empty unichar), the division below is by zero.
   const char *context_ptr = context;
   char *modified_context = NULL;
   char *modified_context_end = NULL;
   const char *unichar_ptr = unichar;
   const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
   float prob = 0.0f;
   int step = 0;
   // Stop at the end of unichar or when utf8_step() returns a value <= 0.
   while (unichar_ptr < unichar_end &&
          (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
     if (language_model_debug_level > 1) {
       tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
               dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
     }
     prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
     ++(*unichar_step_len);
     if (language_model_ngram_use_only_first_uft8_step) break;
     unichar_ptr += step;
     // If there are multiple UTF8 characters present in unichar, context is
     // updated to include the previously examined characters from str,
     // unless use_only_first_uft8_step is true.
     if (unichar_ptr < unichar_end) {
       if (modified_context == NULL) {
         // Lazily allocate a buffer holding the original context plus the
         // bytes of unichar that get appended, plus the terminator.
         int context_len = strlen(context);
         modified_context =
           new char[context_len + strlen(unichar_ptr) + step + 1];
         // strncpy with exactly context_len bytes does not write a
         // terminator; it is written explicitly after each append below.
         strncpy(modified_context, context, context_len);
         modified_context_end = modified_context + context_len;
         context_ptr = modified_context;
       }
       // Append the step that was just scored (unichar_ptr already advanced).
       strncpy(modified_context_end, unichar_ptr - step, step);
       modified_context_end += step;
       *modified_context_end = '\0';
     }
   }
   prob /= static_cast<float>(*unichar_step_len);  // normalize
   if (prob < language_model_ngram_small_prob) {
     if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
     *found_small_prob = true;
     prob = language_model_ngram_small_prob;
   }
   *ngram_cost = -1.0*log2(prob);
   float ngram_and_classifier_cost =
       -1.0*log2(CertaintyScore(certainty)/denom) +
       *ngram_cost * language_model_ngram_scale_factor;
   if (language_model_debug_level > 1) {
     tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
             unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
             ngram_and_classifier_cost);
   }
   // delete[] on NULL is a no-op, so this is safe when no buffer was needed.
   delete[] modified_context;
   return ngram_and_classifier_cost;
 }

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord	(	ViterbiStateEntry *	vse,
		WERD_RES *	word_res,
		DANGERR *	fixpt,
		BlamerBundle *	blamer_bundle,
		bool *	truth_path
	)

protected

Definition at line 1383 of file language_model.cpp.

                       {
   // Builds a newly allocated WERD_CHOICE for the path ending at vse by
   // following parent_vse pointers back to the start of the path.
   // Side effects visible here: *truth_path (when non-NULL) is set to whether
   // the path matches the blamer's correct segmentation, and
   // vse->associate_stats (full_wh_ratio_var, shape_cost) is updated in place.
   // The returned word is allocated with new; presumably the caller takes
   // ownership -- confirm at the call sites.
   if (truth_path != NULL) {
     // Start from a length match; positions are verified in the loop below.
     *truth_path =
         (blamer_bundle != NULL &&
          vse->length == blamer_bundle->correct_segmentation_length());
   }
   BLOB_CHOICE *curr_b = vse->curr_b;
   ViterbiStateEntry *curr_vse = vse;
 
   int i;
   bool compound = dict_->hyphenated();  // treat hyphenated words as compound
 
   // Re-compute the variance of the width-to-height ratios (since we now
   // can compute the mean over the whole word).
   float full_wh_ratio_mean = 0.0f;
   if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
     // Remove the old variance from the shape cost; it is added back after
     // the recomputation below.
     vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
     full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                           static_cast<float>(vse->length));
     vse->associate_stats.full_wh_ratio_var = 0.0f;
   }
 
   // Construct a WERD_CHOICE by tracing parent pointers.
   WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
   word->set_length(vse->length);
   int total_blobs = 0;
   // Walk from the last unichar (index length-1) back to the first.
   for (i = (vse->length-1); i >= 0; --i) {
     if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
         !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
         *truth_path = false;
     }
     // The number of blobs used for this choice is row - col + 1.
     int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
     total_blobs += num_blobs;
     word->set_blob_choice(i, num_blobs, curr_b);
     // Update the width-to-height ratio variance. Useful for non-space
     // delimited languages to ensure that the blobs are of uniform width.
     // Skip leading and trailing punctuation when computing the variance.
     if ((full_wh_ratio_mean != 0.0f &&
          ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
           !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
       vse->associate_stats.full_wh_ratio_var +=
         pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
       if (language_model_debug_level > 2) {
         tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                 full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
       }
     }
 
     // Mark the word as compound if compound permuter was set for any of
     // the unichars on the path (usually this will happen for unichars
     // that are compounding operators, like "-" and "/").
     if (!compound && curr_vse->dawg_info &&
         curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
 
     // Update curr_* pointers.
     // The break leaves i at the index of the last entry processed, which the
     // assertion below requires to be 0.
     curr_vse = curr_vse->parent_vse;
     if (curr_vse == NULL) break;
     curr_b = curr_vse->curr_b;
   }
   ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
   ASSERT_HOST(total_blobs == word_res->ratings->dimension());
   // Re-adjust shape cost to include the updated width-to-height variance.
   if (full_wh_ratio_mean != 0.0f) {
     vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
   }
 
   word->set_rating(vse->ratings_sum);
   word->set_certainty(vse->min_certainty);
   word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                       vse->consistency_info.BodyMaxXHeight());
   // Choose the permuter in priority order: dictionary (or compound), ngram,
   // top choice, none.
   // NOTE(review): the ngram branch dereferences vse->ngram_info without a
   // NULL check; presumably it is always set when language_model_ngram_on is
   // true -- confirm.
   if (vse->dawg_info != NULL) {
     word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
   } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
     word->set_permuter(NGRAM_PERM);
   } else if (vse->top_choice_flags) {
     word->set_permuter(TOP_CHOICE_PERM);
   } else {
     word->set_permuter(NO_PERM);
   }
   word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                             word_res->ratings));
   return word;
 }

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath	(	const ViterbiStateEntry &	vse,
		float	features[]
	)

static

Definition at line 1334 of file language_model.cpp.

                                                     {
   memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
   // Record dictionary match info.
   int len = vse.length <= kMaxSmallWordUnichars ? 0 :
       vse.length <= kMaxMediumWordUnichars ? 1 : 2;
   if (vse.dawg_info != NULL) {
     int permuter = vse.dawg_info->permuter;
     if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
       if (vse.consistency_info.num_digits == vse.length) {
         features[PTRAIN_DIGITS_SHORT+len] = 1.0;
       } else {
         features[PTRAIN_NUM_SHORT+len] = 1.0;
       }
     } else if (permuter == DOC_DAWG_PERM) {
       features[PTRAIN_DOC_SHORT+len] = 1.0;
     } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
         permuter == COMPOUND_PERM) {
       features[PTRAIN_DICT_SHORT+len] = 1.0;
     } else if (permuter == FREQ_DAWG_PERM) {
       features[PTRAIN_FREQ_SHORT+len] = 1.0;
     }
   }
   // Record shape cost feature (normalized by path length).
   features[PTRAIN_SHAPE_COST_PER_CHAR] =
       vse.associate_stats.shape_cost / static_cast<float>(vse.length);
   // Record ngram cost. (normalized by the path length).
   features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
   if (vse.ngram_info != NULL) {
     features[PTRAIN_NGRAM_COST_PER_CHAR] =
         vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
   }
   // Record consistency-related features.
   // Disabled this feature for due to its poor performance.
   // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
   features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
   features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
   features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
       vse.consistency_info.NumInconsistentChartype() : 0.0;
   features[PTRAIN_NUM_BAD_SPACING] =
       vse.consistency_info.NumInconsistentSpaces();
   // Disabled this feature for now due to its poor performance.
   // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
 
   // Classifier-related features.
   features[PTRAIN_RATING_PER_CHAR] =
       vse.ratings_sum / static_cast<float>(vse.outline_length);
 }

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo	(	int	curr_col,
		bool	word_end,
		BLOB_CHOICE *	b,
		ViterbiStateEntry *	parent_vse,
		WERD_RES *	word_res,
		LMConsistencyInfo *	consistency_info
	)

protected

Definition at line 1010 of file language_model.cpp.

                                          {
   const UNICHARSET &unicharset = dict_->getUnicharset();
   UNICHAR_ID unichar_id = b->unichar_id();
   BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
 
   // Check punctuation validity.
   if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
   if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
     if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
         (unicharset.get_isalpha(parent_b->unichar_id()) ||
          unicharset.get_isdigit(parent_b->unichar_id()))) {
       // reset punc_ref for compound words
       consistency_info->punc_ref = NO_EDGE;
     } else {
       bool is_apos = dict_->is_apostrophe(unichar_id);
       bool prev_is_numalpha = (parent_b != NULL &&
           (unicharset.get_isalpha(parent_b->unichar_id()) ||
            unicharset.get_isdigit(parent_b->unichar_id())));
       UNICHAR_ID pattern_unichar_id =
         (unicharset.get_isalpha(unichar_id) ||
          unicharset.get_isdigit(unichar_id) ||
          (is_apos && prev_is_numalpha)) ?
         Dawg::kPatternUnicharID : unichar_id;
       if (consistency_info->punc_ref == NO_EDGE ||
           pattern_unichar_id != Dawg::kPatternUnicharID ||
           dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
           Dawg::kPatternUnicharID) {
         NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                               consistency_info->punc_ref);
         consistency_info->punc_ref =
           (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
               node, pattern_unichar_id, word_end) : NO_EDGE;
         if (consistency_info->punc_ref == NO_EDGE) {
           consistency_info->invalid_punc = true;
         }
       }
     }
   }
 
   // Update case related counters.
   if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
     // Reset counters if we are dealing with a compound word.
     consistency_info->num_lower = 0;
     consistency_info->num_non_first_upper = 0;
   }
   else if (unicharset.get_islower(unichar_id)) {
     consistency_info->num_lower++;
   } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
     if (unicharset.get_isupper(parent_b->unichar_id()) ||
         consistency_info->num_lower > 0 ||
         consistency_info->num_non_first_upper > 0) {
       consistency_info->num_non_first_upper++;
     }
   }
 
   // Initialize consistency_info->script_id (use script of unichar_id
   // if it is not Common, use script id recorded by the parent otherwise).
   // Set inconsistent_script to true if the script of the current unichar
   // is not consistent with that of the parent.
   consistency_info->script_id = unicharset.get_script(unichar_id);
   // Hiragana and Katakana can mix with Han.
   if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
     if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
          consistency_info->script_id == unicharset.hiragana_sid()) ||
         (unicharset.katakana_sid() != unicharset.null_sid() &&
          consistency_info->script_id == unicharset.katakana_sid())) {
       consistency_info->script_id = dict_->getUnicharset().han_sid();
     }
   }
 
   if (parent_vse != NULL &&
       (parent_vse->consistency_info.script_id !=
        dict_->getUnicharset().common_sid())) {
     int parent_script_id = parent_vse->consistency_info.script_id;
     // If script_id is Common, use script id of the parent instead.
     if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
       consistency_info->script_id = parent_script_id;
     }
     if (consistency_info->script_id != parent_script_id) {
       consistency_info->inconsistent_script = true;
     }
   }
 
   // Update chartype related counters.
   if (unicharset.get_isalpha(unichar_id)) {
     consistency_info->num_alphas++;
   } else if (unicharset.get_isdigit(unichar_id)) {
     consistency_info->num_digits++;
   } else if (!unicharset.get_ispunctuation(unichar_id)) {
     consistency_info->num_other++;
   }
 
   // Check font and spacing consistency.
   if (fontinfo_table_->size() > 0 && parent_b != NULL) {
     int fontinfo_id = -1;
     if (parent_b->fontinfo_id() == b->fontinfo_id() ||
         parent_b->fontinfo_id2() == b->fontinfo_id()) {
       fontinfo_id = b->fontinfo_id();
     } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
                 parent_b->fontinfo_id2() == b->fontinfo_id2()) {
       fontinfo_id = b->fontinfo_id2();
     }
     if(language_model_debug_level > 1) {
       tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
               (parent_b->fontinfo_id() >= 0) ?
                   fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
               (parent_b->fontinfo_id2() >= 0) ?
                   fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
               (b->fontinfo_id() >= 0) ?
                   fontinfo_table_->get(b->fontinfo_id()).name : "",
               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
               (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
               fontinfo_id);
     }
     if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
       bool expected_gap_found = false;
       float expected_gap;
       int temp_gap;
       if (fontinfo_id >= 0) {  // found a common font
         ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
         if (fontinfo_table_->get(fontinfo_id).get_spacing(
             parent_b->unichar_id(), unichar_id, &temp_gap)) {
           expected_gap = temp_gap;
           expected_gap_found = true;
         }
       } else {
         consistency_info->inconsistent_font = true;
         // Get an average of the expected gaps in each font
         int num_addends = 0;
         expected_gap = 0;
         int temp_fid;
         for (int i = 0; i < 4; ++i) {
           if (i == 0) {
             temp_fid = parent_b->fontinfo_id();
           } else if (i == 1) {
             temp_fid = parent_b->fontinfo_id2();
           } else if (i == 2) {
             temp_fid = b->fontinfo_id();
           } else {
             temp_fid = b->fontinfo_id2();
           }
           ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
           if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
               parent_b->unichar_id(), unichar_id, &temp_gap)) {
             expected_gap += temp_gap;
             num_addends++;
           }
         }
         expected_gap_found = (num_addends > 0);
         if (num_addends > 0) {
           expected_gap /= static_cast<float>(num_addends);
         }
       }
       if (expected_gap_found) {
         float actual_gap =
             static_cast<float>(word_res->GetBlobsGap(curr_col-1));
         float gap_ratio = expected_gap / actual_gap;
         // TODO(rays) The gaps seem to be way off most of the time, saved by
         // the error here that the ratio was compared to 1/2, when it should
         // have been 0.5f. Find the source of the gaps discrepancy and put
         // the 0.5f here in place of 0.0f.
         // Test on 2476595.sj, pages 0 to 6. (In French.)
         if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
           consistency_info->num_inconsistent_spaces++;
         }
         if (language_model_debug_level > 1) {
           tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                   unicharset.id_to_unichar(parent_b->unichar_id()),
                   parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                   unichar_id, curr_col, expected_gap, actual_gap);
         }
       }
     }
   }
 }

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo	(	bool	word_end,
		int	curr_col,
		int	curr_row,
		const BLOB_CHOICE &	b,
		const ViterbiStateEntry *	parent_vse
	)

protected

Definition at line 781 of file language_model.cpp.

                                          {
   // Builds the dictionary (dawg) state for the path formed by appending blob
   // choice b to the path ending at parent_vse (NULL when b starts the word).
   // Returns a newly allocated LanguageModelDawgInfo, or NULL when the
   // extended path is not accepted as a dictionary path.
   // Works through the shared dawg_args_ member, which is modified here.
   // curr_col is only used to tell has_hyphen_end() whether b is in the first
   // column; curr_row is not used in this body.

   // Initialize active_dawgs from parent_vse if it is not NULL.
   // Otherwise use very_beginning_active_dawgs_.
   if (parent_vse == NULL) {
     dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
     dawg_args_.permuter = NO_PERM;
   } else {
     if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
     dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
     dawg_args_.permuter = parent_vse->dawg_info->permuter;
   }

   // Deal with hyphenated words.
   // A word-ending hyphen keeps the incoming active dawgs unchanged and marks
   // the path as COMPOUND_PERM.
   if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
     if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
     return new LanguageModelDawgInfo(dawg_args_.active_dawgs,
                                      COMPOUND_PERM);
   }

   // Deal with compound words.
   // parent_vse->dawg_info is non-NULL here whenever parent_vse is non-NULL,
   // because of the early return above.
   if (dict_->compound_marker(b.unichar_id()) &&
       (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
     if (language_model_debug_level > 0) tprintf("Found compound marker\n");
     // Do not allow compound operators at the beginning and end of the word.
     // Do not allow more than one compound operator per word.
     // Do not allow compounding of words with lengths shorter than
     // language_model_min_compound_length
     if (parent_vse == NULL || word_end ||
         dawg_args_.permuter == COMPOUND_PERM ||
         parent_vse->length < language_model_min_compound_length) return NULL;

     int i;
     // Check that the path terminated before the current character is a word:
     // some active word-type dawg position of the parent must be at an
     // end-of-word edge.
     bool has_word_ending = false;
     for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
       const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
       const Dawg *pdawg = pos.dawg_index < 0
           ? NULL : dict_->GetDawg(pos.dawg_index);
       // Skip positions without a dawg and those flagged back_to_punc.
       if (pdawg == NULL || pos.back_to_punc) continue;;
       if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
           pdawg->end_of_word(pos.dawg_ref)) {
         has_word_ending = true;
         break;
       }
     }
     if (!has_word_ending) return NULL;

     // The part after the compound marker starts over from
     // beginning_active_dawgs_.
     if (language_model_debug_level > 0) tprintf("Compound word found\n");
     return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
   }  // done dealing with compound words

   LanguageModelDawgInfo *dawg_info = NULL;

   // Call LetterIsOkay().
   // Use the normalized IDs so that all shapes of ' can be allowed in words
   // like don't.
   const GenericVector<UNICHAR_ID>& normed_ids =
       dict_->getUnicharset().normed_ids(b.unichar_id());
   DawgPositionVector tmp_active_dawgs;
   for (int i = 0; i < normed_ids.size(); ++i) {
     if (language_model_debug_level > 2)
       tprintf("Test Letter OK for unichar %d, normed %d\n",
               b.unichar_id(), normed_ids[i]);
     // Only the last normalized id may be treated as the end of the word.
     dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
                         word_end && i == normed_ids.size() - 1);
     if (dawg_args_.permuter == NO_PERM) {
       // This normalized id was rejected, so the whole unichar is rejected.
       break;
     } else if (i < normed_ids.size() - 1) {
       // More ids follow: copy the updated positions into a local vector and
       // use that copy as the active set for the next id.
       tmp_active_dawgs = *dawg_args_.updated_dawgs;
       dawg_args_.active_dawgs = &tmp_active_dawgs;
     }
     if (language_model_debug_level > 2)
       tprintf("Letter was OK for unichar %d, normed %d\n",
               b.unichar_id(), normed_ids[i]);
   }
   // active_dawgs may point at the local tmp_active_dawgs, so clear it before
   // this function returns.
   dawg_args_.active_dawgs = NULL;
   if (dawg_args_.permuter != NO_PERM) {
     dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
                                           dawg_args_.permuter);
   } else if (language_model_debug_level > 3) {
     tprintf("Letter %s not OK!\n",
             dict_->getUnicharset().id_to_unichar(b.unichar_id()));
   }

   return dawg_info;
 }

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo	(	const char *	unichar,
		float	certainty,
		float	denom,
		int	curr_col,
		int	curr_row,
		float	outline_length,
		const ViterbiStateEntry *	parent_vse
	)

protected

Definition at line 872 of file language_model.cpp.

                                          {
   // Initialize parent context.
   const char *pcontext_ptr = "";
   int pcontext_unichar_step_len = 0;
   if (parent_vse == NULL) {
     pcontext_ptr = prev_word_str_.string();
     pcontext_unichar_step_len = prev_word_unichar_step_len_;
   } else {
     pcontext_ptr = parent_vse->ngram_info->context.string();
     pcontext_unichar_step_len =
       parent_vse->ngram_info->context_unichar_step_len;
   }
   // Compute p(unichar | parent context).
   int unichar_step_len = 0;
   bool pruned = false;
   float ngram_cost;
   float ngram_and_classifier_cost =
       ComputeNgramCost(unichar, certainty, denom,
                        pcontext_ptr, &unichar_step_len,
                        &pruned, &ngram_cost);
   // Normalize just the ngram_and_classifier_cost by outline_length.
   // The ngram_cost is used by the params_model, so it needs to be left as-is,
   // and the params model cost will be normalized by outline_length.
   ngram_and_classifier_cost *=
       outline_length / language_model_ngram_rating_factor;
   // Add the ngram_cost of the parent.
   if (parent_vse != NULL) {
     ngram_and_classifier_cost +=
         parent_vse->ngram_info->ngram_and_classifier_cost;
     ngram_cost += parent_vse->ngram_info->ngram_cost;
   }
 
   // Shorten parent context string by unichar_step_len unichars.
   int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                     language_model_ngram_order);
   if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
   while (num_remove > 0 && *pcontext_ptr != '\0') {
     pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
     --num_remove;
   }
 
   // Decide whether to prune this ngram path and update changed accordingly.
   if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
 
   // Construct and return the new LanguageModelNgramInfo.
   LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
       pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
       ngram_and_classifier_cost);
   ngram_info->context += unichar;
   ngram_info->context_unichar_step_len += unichar_step_len;
   assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
   return ngram_info;
 }

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo	(	ViterbiStateEntry *	new_vse,
		const ViterbiStateEntry *	parent_vse,
		LanguageModelState *	lms
	)

protected

Definition at line 765 of file language_model.cpp.

                                                                    {
   ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
   for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
        new_vse->cost >= vit.data()->cost; vit.forward()) {
     // Clear the appropriate flags if the list already contains
     // a top choice entry with a lower cost.
     new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
   }
   if (language_model_debug_level > 2) {
     tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
             new_vse->top_choice_flags);
   }
 }

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE	(	bool	just_classified,
		bool	mixed_alnum,
		const BLOB_CHOICE *	bc,
		LanguageModelFlagsType	blob_choice_flags,
		const UNICHARSET &	unicharset,
		WERD_RES *	word_res,
		ViterbiStateEntry_IT *	vse_it,
		LanguageModelFlagsType *	top_choice_flags
	)		const

protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 496 of file language_model.cpp.

                                                     {
   for (; !vse_it->cycled_list(); vse_it->forward()) {
     ViterbiStateEntry* parent_vse = vse_it->data();
     // Only consider the parent if it has been updated or
     // if the current ratings cell has just been classified.
     if (!just_classified && !parent_vse->updated) continue;
     if (language_model_debug_level > 2)
       parent_vse->Print("Considering");
     // If the parent is non-alnum, then upper counts as lower.
     *top_choice_flags = blob_choice_flags;
     if ((blob_choice_flags & kUpperCaseFlag) &&
         !parent_vse->HasAlnumChoice(unicharset)) {
       *top_choice_flags |= kLowerCaseFlag;
     }
     *top_choice_flags &= parent_vse->top_choice_flags;
     UNICHAR_ID unichar_id = bc->unichar_id();
     const BLOB_CHOICE* parent_b = parent_vse->curr_b;
     UNICHAR_ID parent_id = parent_b->unichar_id();
     // Digits do not bind to alphas if there is a mix in both parent and current
     // or if the alpha is not the top choice.
     if (unicharset.get_isdigit(unichar_id) &&
         unicharset.get_isalpha(parent_id) &&
         (mixed_alnum || *top_choice_flags == 0))
       continue;  // Digits don't bind to alphas.
     // Likewise alphas do not bind to digits if there is a mix in both or if
     // the digit is not the top choice.
     if (unicharset.get_isalpha(unichar_id) &&
         unicharset.get_isdigit(parent_id) &&
         (mixed_alnum || *top_choice_flags == 0))
       continue;  // Alphas don't bind to digits.
     // If there is a case mix of the same alpha in the parent list, then
     // competing_vse is non-null and will be used to determine whether
     // or not to bind the current blob choice.
     if (parent_vse->competing_vse != NULL) {
       const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
       UNICHAR_ID other_id = competing_b->unichar_id();
       if (language_model_debug_level >= 5) {
         tprintf("Parent %s has competition %s\n",
                 unicharset.id_to_unichar(parent_id),
                 unicharset.id_to_unichar(other_id));
       }
       if (unicharset.SizesDistinct(parent_id, other_id)) {
         // If other_id matches bc wrt position and size, and parent_id, doesn't,
         // don't bind to the current parent.
         if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
                                 language_model_debug_level >= 5) &&
             !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
                                 language_model_debug_level >= 5))
           continue;  // Competing blobchoice has a better vertical match.
       }
     }
     vse_it->forward();
     return parent_vse;  // This one is good!
   }
   return NULL;  // Ran out of possibilities.
 }

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )

inline

Definition at line 100 of file language_model.h.

100 { return params_model_; }

tesseract::LanguageModel::params_model_

ParamsModel params_model_

Definition: language_model.h:413

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit	(	BLOB_CHOICE_LIST *	curr_list,
		BLOB_CHOICE **	first_lower,
		BLOB_CHOICE **	first_upper,
		BLOB_CHOICE **	first_digit
	)		const

protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 379 of file language_model.cpp.

                                                                            {
   BLOB_CHOICE_IT c_it(curr_list);
   const UNICHARSET &unicharset = dict_->getUnicharset();
   BLOB_CHOICE *first_unichar = NULL;
   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     UNICHAR_ID unichar_id = c_it.data()->unichar_id();
     if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
     if (first_unichar == NULL) first_unichar = c_it.data();
     if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
       *first_lower = c_it.data();
     }
     if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
         !unicharset.get_islower(unichar_id)) {
       *first_upper = c_it.data();
     }
     if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
       *first_digit = c_it.data();
     }
   }
   ASSERT_HOST(first_unichar != NULL);
   bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
       *first_digit != NULL;
   if (*first_lower == NULL) *first_lower = first_unichar;
   if (*first_upper == NULL) *first_upper = first_unichar;
   if (*first_digit == NULL) *first_digit = first_unichar;
   return mixed;
 }

◆ InitForWord()

void tesseract::LanguageModel::InitForWord	(	const WERD_CHOICE *	prev_word,
		bool	fixed_pitch,
		float	max_char_wh_ratio,
		float	rating_cert_scale
	)

Definition at line 132 of file language_model.cpp.

                                                          {
   fixed_pitch_ = fixed_pitch;
   max_char_wh_ratio_ = max_char_wh_ratio;
   rating_cert_scale_ = rating_cert_scale;
   acceptable_choice_found_ = false;
   correct_segmentation_explored_ = false;
 
   // Initialize vectors with beginning DawgInfos.
   very_beginning_active_dawgs_.clear();
   dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
   beginning_active_dawgs_.clear();
   dict_->default_dawgs(&beginning_active_dawgs_, false);
 
   // Fill prev_word_str_ with the last language_model_ngram_order
   // unichars from prev_word.
   if (language_model_ngram_on) {
     if (prev_word != NULL && prev_word->unichar_string() != NULL) {
       prev_word_str_ = prev_word->unichar_string();
       if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
     } else {
       prev_word_str_ = " ";
     }
     const char *str_ptr = prev_word_str_.string();
     const char *str_end = str_ptr + prev_word_str_.length();
     int step;
     prev_word_unichar_step_len_ = 0;
     while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
       str_ptr += step;
       ++prev_word_unichar_step_len_;
     }
     ASSERT_HOST(str_ptr == str_end);
   }
 }

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry & vse )

inlineprotected

Definition at line 291 of file language_model.h.

                                                          {
     if (vse.top_choice_flags) return false;
     if (vse.dawg_info != NULL &&
         (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
          vse.dawg_info->permuter == USER_DAWG_PERM ||
          vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
     return true;
   }

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool val )

inline

Definition at line 96 of file language_model.h.

                                                  {
     // Overwrites the flag that UpdateBestChoice() sets to true when the new
     // best choice is accepted and that InitForWord() resets to false.
     acceptable_choice_found_ = val;
   }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState * parent_node ) const

protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 419 of file language_model.cpp.

                                            {
   if (parent_node == NULL) return -1;
   UNICHAR_ID top_id = INVALID_UNICHAR_ID;
   ViterbiStateEntry* top_lower = NULL;
   ViterbiStateEntry* top_upper = NULL;
   ViterbiStateEntry* top_digit = NULL;
   ViterbiStateEntry* top_choice = NULL;
   float lower_rating = 0.0f;
   float upper_rating = 0.0f;
   float digit_rating = 0.0f;
   float top_rating = 0.0f;
   const UNICHARSET &unicharset = dict_->getUnicharset();
   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
     ViterbiStateEntry* vse = vit.data();
     // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
     // back to the real character if needed.
     ViterbiStateEntry* unichar_vse = vse;
     UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
     float rating = unichar_vse->curr_b->rating();
     while (unichar_id == INVALID_UNICHAR_ID &&
            unichar_vse->parent_vse != NULL) {
       unichar_vse = unichar_vse->parent_vse;
       unichar_id = unichar_vse->curr_b->unichar_id();
       rating = unichar_vse->curr_b->rating();
     }
     if (unichar_id != INVALID_UNICHAR_ID) {
       if (unicharset.get_islower(unichar_id)) {
         if (top_lower == NULL || lower_rating > rating) {
           top_lower = vse;
           lower_rating = rating;
         }
       } else if (unicharset.get_isalpha(unichar_id)) {
         if (top_upper == NULL || upper_rating > rating) {
           top_upper = vse;
           upper_rating = rating;
         }
       } else if (unicharset.get_isdigit(unichar_id)) {
         if (top_digit == NULL || digit_rating > rating) {
           top_digit = vse;
           digit_rating = rating;
         }
       }
     }
     if (top_choice == NULL || top_rating > rating) {
       top_choice = vse;
       top_rating = rating;
       top_id = unichar_id;
     }
   }
   if (top_choice == NULL) return -1;
   bool mixed = (top_lower != NULL || top_upper != NULL) &&
       top_digit != NULL;
   if (top_lower == NULL) top_lower = top_choice;
   top_lower->top_choice_flags |= kLowerCaseFlag;
   if (top_upper == NULL) top_upper = top_choice;
   top_upper->top_choice_flags |= kUpperCaseFlag;
   if (top_digit == NULL) top_digit = top_choice;
   top_digit->top_choice_flags |= kDigitFlag;
   top_choice->top_choice_flags |= kSmallestRatingFlag;
   if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
       (top_choice->top_choice_flags &
           (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
     // If the compound marker top choice carries any of the top alnum flags,
     // then give it all of them, allowing words like I-295 to be chosen.
     top_choice->top_choice_flags |=
         kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
   }
   return mixed ? 1 : 0;
 }

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice	(	ViterbiStateEntry *	vse,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

protected

Definition at line 1234 of file language_model.cpp.

                                  {
   // Builds the WERD_CHOICE for the path ending at vse and offers it to
   // word_res as a raw choice and as a cooked (best) choice. If it becomes
   // the new best choice, best_choice_bundle, the dictionary hyphen state and
   // acceptable_choice_found_ are updated. When blamer_bundle is non-NULL the
   // path's features are also recorded as a training hypothesis.
   // pain_points is not used in this body.
   bool truth_path;
   WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
                                     blamer_bundle, &truth_path);
   ASSERT_HOST(word != NULL);
   if (dict_->stopper_debug_level >= 1) {
     STRING word_str;
     word->string_and_lengths(&word_str, NULL);
     vse->Print(word_str.string());
   }
   if (language_model_debug_level > 0) {
     word->print("UpdateBestChoice() constructed word");
   }
   // Record features from the current path if necessary.
   ParamsTrainingHypothesis curr_hyp;
   if (blamer_bundle != NULL) {
     // Copy the word's permuter back onto the path's dawg info before
     // extracting features, since ExtractFeaturesFromPath() reads it.
     if (vse->dawg_info != NULL) vse->dawg_info->permuter =
         static_cast<PermuterType>(word->permuter());
     ExtractFeaturesFromPath(*vse, curr_hyp.features);
     word->string_and_lengths(&(curr_hyp.str), NULL);
     curr_hyp.cost = vse->cost;  // record cost for error rate computations
     if (language_model_debug_level > 0) {
       tprintf("Raw features extracted from %s (cost=%g) [ ",
               curr_hyp.str.string(), curr_hyp.cost);
       for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
         tprintf("%g ", curr_hyp.features[deb_i]);
       }
       tprintf("]\n");
     }
     // Record the current hypothesis in params_training_bundle.
     blamer_bundle->AddHypothesis(curr_hyp);
     if (truth_path)
       blamer_bundle->UpdateBestRating(word->rating());
   }
   if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
     // The word was constructed solely for blamer_bundle->AddHypothesis, so
     // we no longer need it.
     delete word;
     return;
   }
   if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
     word->SetScriptPositions(false, word_res->chopped_word);
   // Update and log new raw_choice if needed.
   // At this point the word's rating is still the one set by ConstructWord().
   if (word_res->raw_choice == NULL ||
       word->rating() < word_res->raw_choice->rating()) {
     if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
       tprintf("Updated raw choice\n");
   }
   // Set the modified rating for best choice to vse->cost and log best choice.
   word->set_rating(vse->cost);
   // Call LogNewChoice() for best choice from Dict::adjust_word() since it
   // computes adjust_factor that is used by the adaption code (e.g. by
   // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
   // Note: the rating of the word is not adjusted.
   dict_->adjust_word(word, vse->dawg_info == NULL,
                      vse->consistency_info.xht_decision, 0.0,
                      false, language_model_debug_level > 0);
   // Hand ownership of the word over to the word_res.
   if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
                                     dict_->stopper_debug_level >= 1, word)) {
     // The word was so bad that it was deleted.
     return;
   }
   if (word_res->best_choice == word) {
     // Word was the new best.
     if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
         AcceptablePath(*vse)) {
       acceptable_choice_found_ = true;
     }
     // Update best_choice_bundle.
     best_choice_bundle->updated = true;
     best_choice_bundle->best_vse = vse;
     if (language_model_debug_level > 0) {
       tprintf("Updated best choice\n");
       word->print_state("New state ");
     }
     // Update hyphen state if we are dealing with a dictionary word.
     if (vse->dawg_info != NULL) {
       if (dict_->has_hyphen_end(*word)) {
         // NOTE(review): dawg_args_.active_dawgs is dereferenced without a
         // NULL check, and GenerateDawgInfo() sets it to NULL on its regular
         // (non-hyphen, non-compound) path. Confirm it is always valid and
         // belongs to this path when this line runs.
         dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
       } else {
         dict_->reset_hyphen_vars(true);
       }
     }

     if (blamer_bundle != NULL) {
       blamer_bundle->set_best_choice_is_dict_and_top_choice(
           vse->dawg_info != NULL && vse->top_choice_flags);
     }
   }
   if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
     word->DisplaySegmentation(word_res->chopped_word);
   }
 }

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState	(	bool	just_classified,
		int	curr_col,
		int	curr_row,
		BLOB_CHOICE_LIST *	curr_list,
		LanguageModelState *	parent_node,
		LMPainPoints *	pain_points,
		WERD_RES *	word_res,
		BestChoiceBundle *	best_choice_bundle,
		BlamerBundle *	blamer_bundle
	)

UpdateState has the job of combining the ViterbiStateEntry list held by parent_node with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

paths that are liked by the language model: either a DAWG or the n-gram model, where active.
paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 249 of file language_model.cpp.

                                  {
   if (language_model_debug_level > 0) {
     tprintf("\nUpdateState: col=%d row=%d %s",
             curr_col, curr_row, just_classified ? "just_classified" : "");
     if (language_model_debug_level > 5)
       tprintf("(parent=%p)\n", parent_node);
     else
       tprintf("\n");
   }
   // Initialize helper variables.
   bool word_end = (curr_row+1 >= word_res->ratings->dimension());
   bool new_changed = false;
   float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
   const UNICHARSET& unicharset = dict_->getUnicharset();
   BLOB_CHOICE *first_lower = NULL;
   BLOB_CHOICE *first_upper = NULL;
   BLOB_CHOICE *first_digit = NULL;
   bool has_alnum_mix = false;
   if (parent_node != NULL) {
     int result = SetTopParentLowerUpperDigit(parent_node);
     if (result < 0) {
       if (language_model_debug_level > 0)
         tprintf("No parents found to process\n");
       return false;
     }
     if (result > 0)
       has_alnum_mix = true;
   }
   if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
                              &first_digit))
     has_alnum_mix = false;;
   ScanParentsForCaseMix(unicharset, parent_node);
   if (language_model_debug_level > 3 && parent_node != NULL) {
     parent_node->Print("Parent viterbi list");
   }
   LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
 
   // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
   ViterbiStateEntry_IT vit;
   BLOB_CHOICE_IT c_it(curr_list);
   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
     BLOB_CHOICE* choice = c_it.data();
     // TODO(antonova): make sure commenting this out if ok for ngram
     // model scoring (I think this was introduced to fix ngram model quirks).
     // Skip NULL unichars unless it is the only choice.
     //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
     UNICHAR_ID unichar_id = choice->unichar_id();
     if (unicharset.get_fragment(unichar_id)) {
       continue;  // Skip fragments.
     }
     // Set top choice flags.
     LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
     if (c_it.at_first() || !new_changed)
       blob_choice_flags |= kSmallestRatingFlag;
     if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
     if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
     if (first_digit == choice) blob_choice_flags |= kDigitFlag;
 
     if (parent_node == NULL) {
       // Process the beginning of a word.
       // If there is a better case variant that is not distinguished by size,
       // skip this blob choice, as we have no choice but to accept the result
       // of the character classifier to distinguish between them, even if
       // followed by an upper case.
       // With words like iPoc, and other CamelBackWords, the lower-upper
       // transition can only be achieved if the classifier has the correct case
       // as the top choice, and leaving an initial I lower down the list
       // increases the chances of choosing IPoc simply because it doesn't
       // include such a transition. iPoc will beat iPOC and ipoc because
       // the other words are baseline/x-height inconsistent.
       if (HasBetterCaseVariant(unicharset, choice, curr_list))
         continue;
       // Upper counts as lower at the beginning of a word.
       if (blob_choice_flags & kUpperCaseFlag)
         blob_choice_flags |= kLowerCaseFlag;
       new_changed |= AddViterbiStateEntry(
           blob_choice_flags, denom, word_end, curr_col, curr_row,
           choice, curr_state, NULL, pain_points,
           word_res, best_choice_bundle, blamer_bundle);
     } else {
       // Get viterbi entries from each parent ViterbiStateEntry.
       vit.set_to_list(&parent_node->viterbi_state_entries);
       int vit_counter = 0;
       vit.mark_cycle_pt();
       ViterbiStateEntry* parent_vse = NULL;
       LanguageModelFlagsType top_choice_flags;
       while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
                                             c_it.data(), blob_choice_flags,
                                             unicharset, word_res, &vit,
                                             &top_choice_flags)) != NULL) {
         // Skip pruned entries and do not look at prunable entries if already
         // examined language_model_viterbi_list_max_num_prunable of those.
         if (PrunablePath(*parent_vse) &&
             (++vit_counter > language_model_viterbi_list_max_num_prunable ||
              (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
           continue;
         }
         // If the parent has no alnum choice, (ie choice is the first in a
         // string of alnum), and there is a better case variant that is not
         // distinguished by size, skip this blob choice/parent, as with the
         // initial blob treatment above.
         if (!parent_vse->HasAlnumChoice(unicharset) &&
             HasBetterCaseVariant(unicharset, choice, curr_list))
           continue;
         // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
         // looks good according to the Dawgs or character ngram model.
         new_changed |= AddViterbiStateEntry(
             top_choice_flags, denom, word_end, curr_col, curr_row,
             c_it.data(), curr_state, parent_vse, pain_points,
             word_res, best_choice_bundle, blamer_bundle);
       }
     }
   }
   return new_changed;
 }

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_

protected

Definition at line 408 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_

protected

Definition at line 396 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_

protected

Definition at line 410 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_

protected

Definition at line 356 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_

protected

Definition at line 375 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_

protected

Definition at line 382 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_

protected

Definition at line 371 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8

static

Definition at line 48 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2

static

Definition at line 46 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f

static

Definition at line 53 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1

static

Definition at line 45 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4

static

Definition at line 47 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10

static

Definition at line 49 of file language_model.h.

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 308 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 335 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 322 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 310 of file language_model.h.

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 312 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings when multiplied by outline length"

Definition at line 331 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the character classifier"

Definition at line 328 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"

Definition at line 320 of file language_model.h.

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 333 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string when computing log probabilities"

Definition at line 325 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 344 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 348 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 350 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 353 of file language_model.h.

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 340 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 338 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 342 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 346 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 352 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 356 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 317 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_

protected

Definition at line 385 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_

protected

Definition at line 413 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_

protected

Definition at line 392 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_

protected

Definition at line 393 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_

protected

Definition at line 366 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_

protected

Definition at line 395 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 354 of file language_model.h.

The documentation for this class was generated from the following files:

wordrec/language_model.h
wordrec/language_model.cpp

Public Member Functions

Static Public Member Functions

Public Attributes

Static Public Attributes

Protected Member Functions

Protected Attributes

Detailed Description

Constructor & Destructor Documentation

◆ LanguageModel()

◆ ~LanguageModel()

Member Function Documentation

◆ AcceptableChoiceFound()

◆ AcceptablePath()

◆ AddViterbiStateEntry()

◆ CertaintyScore()

◆ ComputeAdjustedPathCost()

◆ ComputeAdjustment()

◆ ComputeAssociateStats()

◆ ComputeConsistencyAdjustment()

◆ ComputeDenom()

◆ ComputeNgramCost()

◆ ConstructWord()

◆ ExtractFeaturesFromPath()

◆ FillConsistencyInfo()

◆ GenerateDawgInfo()

◆ GenerateNgramInfo()

◆ GenerateTopChoiceInfo()

◆ GetNextParentVSE()

◆ getParamsModel()

◆ GetTopLowerUpperDigit()

◆ InitForWord()

◆ PrunablePath()

◆ SetAcceptableChoiceFound()

◆ SetTopParentLowerUpperDigit()

◆ UpdateBestChoice()

◆ UpdateState()

Member Data Documentation

◆ acceptable_choice_found_

◆ beginning_active_dawgs_

◆ correct_segmentation_explored_

◆ dawg_args_

◆ dict_

◆ fixed_pitch_

◆ fontinfo_table_

◆ kDigitFlag

◆ kLowerCaseFlag

◆ kMaxAvgNgramCost

◆ kSmallestRatingFlag

◆ kUpperCaseFlag

◆ kXhtConsistentFlag

◆ language_model_debug_level

◆ language_model_min_compound_length

◆ language_model_ngram_nonmatch_score

◆ language_model_ngram_on

◆ language_model_ngram_order

◆ language_model_ngram_rating_factor

◆ language_model_ngram_scale_factor

◆ language_model_ngram_small_prob

◆ language_model_ngram_space_delimited_language

◆ language_model_ngram_use_only_first_uft8_step

◆ language_model_penalty_case

◆ language_model_penalty_chartype

◆ language_model_penalty_font

◆ language_model_penalty_increment

◆ language_model_penalty_non_dict_word

◆ language_model_penalty_non_freq_dict_word

◆ language_model_penalty_punc

◆ language_model_penalty_script

◆ language_model_penalty_spacing

◆ language_model_use_sigmoidal_certainty

◆ language_model_viterbi_list_max_num_prunable

◆ language_model_viterbi_list_max_size

◆ max_char_wh_ratio_

◆ params_model_

◆ prev_word_str_

◆ prev_word_unichar_step_len_

◆ rating_cert_scale_

◆ very_beginning_active_dawgs_

◆ wordrec_display_segmentations