tesseract  3.05.02
tesseract::TessLangModel Class Reference

#include <tess_lang_model.h>

Inheritance diagram for tesseract::TessLangModel:
tesseract::LangModel

Public Member Functions

 TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
 
 ~TessLangModel ()
 
TessLangModEdge * Root ()
 
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
 
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
 
bool IsLeadingPunc (char_32 ch)
 
bool IsTrailingPunc (char_32 ch)
 
bool IsDigit (char_32 ch)
 
void RemoveInvalidCharacters (string *lm_str)
 
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
 
virtual ~LangModel ()
 
bool OOD ()
 
bool Numeric ()
 
bool WordList ()
 
bool Punc ()
 
void SetOOD (bool ood)
 
void SetNumeric (bool numeric)
 
void SetWordList (bool word_list)
 
void SetPunc (bool punc_enabled)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
 
bool numeric_enabled_
 
bool word_list_enabled_
 
bool punc_enabled_
 

Detailed Description

Definition at line 38 of file tess_lang_model.h.

Constructor & Destructor Documentation

◆ TessLangModel()

tesseract::TessLangModel::TessLangModel ( const string &  lm_params,
const string &  data_file_path,
bool  load_system_dawg,
TessdataManager *  tessdata_manager,
CubeRecoContext *  cntxt 
)

Definition at line 60 of file tess_lang_model.cpp.

64  {
65  cntxt_ = cntxt;
66  has_case_ = cntxt_->HasCase();
67  // Load the rest of the language model elements from file
68  LoadLangModelElements(lm_params);
69  // Load word_dawgs_ if needed.
70  if (tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
71  word_dawgs_ = new DawgVector();
72  if (load_system_dawg &&
73  tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG)) {
74  // The last parameter to the Dawg constructor (the debug level) is set to
75  // false, until Cube has a way to express its preferred debug level.
76  *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(),
78  cntxt_->Lang().c_str(),
79  SYSTEM_DAWG_PERM, false);
80  }
81  } else {
82  word_dawgs_ = NULL;
83  }
84 }
const string & Lang() const
GenericVector< Dawg * > DawgVector
Definition: dict.h:49

◆ ~TessLangModel()

tesseract::TessLangModel::~TessLangModel ( )
inline

Definition at line 45 of file tess_lang_model.h.

45  {
46  if (word_dawgs_ != NULL) {
47  word_dawgs_->delete_data_pointers();
48  delete word_dawgs_;
49  }
50  }
void delete_data_pointers()

Member Function Documentation

◆ GetEdges()

LangModEdge ** tesseract::TessLangModel::GetEdges ( CharAltList *  alt_list,
LangModEdge *  edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 169 of file tess_lang_model.cpp.

171  {
172  TessLangModEdge *tess_lm_edge =
173  reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
174  LangModEdge **edge_array = NULL;
175  (*edge_cnt) = 0;
176 
177  // if we are starting from the root, we'll instantiate every DAWG
178  // and get all the edges that emerge from the root
179  if (tess_lm_edge == NULL) {
180  // get DAWG count from Tesseract
181  int dawg_cnt = NumDawgs();
182  // preallocate the edge buffer
183  (*edge_cnt) = dawg_cnt * max_edge_;
184  edge_array = new LangModEdge *[(*edge_cnt)];
185 
186  for (int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
187  const Dawg *curr_dawg = GetDawg(dawg_idx);
188  // Only look through word Dawgs (since there is a special way of
189  // handling numbers and punctuation).
190  if (curr_dawg->type() == DAWG_TYPE_WORD) {
191  (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true,
192  edge_array + (*edge_cnt));
193  }
194  } // dawg
195 
196  (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
197  edge_array + (*edge_cnt));
198 
199  // OOD: it is intentionally not added to the list to make sure it comes
200  // at the end
201  (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
202  edge_array + (*edge_cnt));
203 
204  // set the root flag for all root edges
205  for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
206  edge_array[edge_idx]->SetRoot(true);
207  }
208  } else { // not starting at the root
209  // preallocate the edge buffer
210  (*edge_cnt) = max_edge_;
211  // allocate memory for edges
212  edge_array = new LangModEdge *[(*edge_cnt)];
213 
214  // get the FanOut edges from the root of each dawg
215  (*edge_cnt) = FanOut(alt_list,
216  tess_lm_edge->GetDawg(),
217  tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(),
218  tess_lm_edge->EdgeString(), false, edge_array);
219  }
220  return edge_array;
221 }

◆ IsDigit()

bool tesseract::TessLangModel::IsDigit ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 162 of file tess_lang_model.cpp.

162  {
163  return digits_.find(ch) != string::npos;
164 }

◆ IsLeadingPunc()

bool tesseract::TessLangModel::IsLeadingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 154 of file tess_lang_model.cpp.

154  {
155  return lead_punc_.find(ch) != string::npos;
156 }

◆ IsTrailingPunc()

bool tesseract::TessLangModel::IsTrailingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 158 of file tess_lang_model.cpp.

158  {
159  return trail_punc_.find(ch) != string::npos;
160 }

◆ IsValidSequence()

bool tesseract::TessLangModel::IsValidSequence ( const char_32 *  sequence,
bool  eow_flag,
LangModEdge **  final_edge = NULL 
)
virtual

Implements tesseract::LangModel.

Definition at line 145 of file tess_lang_model.cpp.

146  {
147  if (final_edge != NULL) {
148  (*final_edge) = NULL;
149  }
150 
151  return IsValidSequence(NULL, sequence, eow_flag, final_edge);
152 }
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)

◆ RemoveInvalidCharacters()

void tesseract::TessLangModel::RemoveInvalidCharacters ( string *  lm_str)

Definition at line 467 of file tess_lang_model.cpp.

467  {
468  CharSet *char_set = cntxt_->CharacterSet();
469  tesseract::string_32 lm_str32;
470  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &lm_str32);
471 
472  int len = CubeUtils::StrLen(lm_str32.c_str());
473  char_32 *clean_str32 = new char_32[len + 1];
474  int clean_len = 0;
475  for (int i = 0; i < len; ++i) {
476  int class_id = char_set->ClassID((char_32)lm_str32[i]);
477  if (class_id != INVALID_UNICHAR_ID) {
478  clean_str32[clean_len] = lm_str32[i];
479  ++clean_len;
480  }
481  }
482  clean_str32[clean_len] = 0;
483  if (clean_len < len) {
484  lm_str->clear();
485  CubeUtils::UTF32ToUTF8(clean_str32, lm_str);
486  }
487  delete [] clean_str32;
488 }
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:272
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
CharSet * CharacterSet() const
signed int char_32
Definition: string_32.h:40
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256

◆ Root()

TessLangModEdge* tesseract::TessLangModel::Root ( )
inline virtual

Implements tesseract::LangModel.

Definition at line 53 of file tess_lang_model.h.

53  {
54  return NULL;
55  }

The documentation for this class was generated from the following files: