tesseract  3.05.02
tesseract::WordListLangModel Class Reference

#include <word_list_lang_model.h>

Inheritance diagram for tesseract::WordListLangModel:
tesseract::LangModel

Public Member Functions

 WordListLangModel (CubeRecoContext *cntxt)
 
 ~WordListLangModel ()
 
LangModEdge * Root ()
 
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
 
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **edges)
 
bool IsLeadingPunc (char_32 ch)
 
bool IsTrailingPunc (char_32 ch)
 
bool IsDigit (char_32 ch)
 
bool AddString (const char *char_ptr)
 
bool AddString32 (const char_32 *char_32_ptr)
 
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
 
virtual ~LangModel ()
 
bool OOD ()
 
bool Numeric ()
 
bool WordList ()
 
bool Punc ()
 
void SetOOD (bool ood)
 
void SetNumeric (bool numeric)
 
void SetWordList (bool word_list)
 
void SetPunc (bool punc_enabled)
 

Static Public Member Functions

static void WordVariants (const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE *> *word_variants)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
 
bool numeric_enabled_
 
bool word_list_enabled_
 
bool punc_enabled_
 

Detailed Description

Definition at line 39 of file word_list_lang_model.h.

Constructor & Destructor Documentation

◆ WordListLangModel()

tesseract::WordListLangModel::WordListLangModel ( CubeRecoContext * cntxt)
explicit

Definition at line 29 of file word_list_lang_model.cpp.

29  {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }

◆ ~WordListLangModel()

tesseract::WordListLangModel::~WordListLangModel ( )

Definition at line 35 of file word_list_lang_model.cpp.

35  {
36  Cleanup();
37 }

Member Function Documentation

◆ AddString()

bool tesseract::WordListLangModel::AddString ( const char *  char_ptr)

Definition at line 160 of file word_list_lang_model.cpp.

160  {
161  if (!init_ && !Init()) { // initialize if necessary
162  return false;
163  }
164 
165  string_32 str32;
166  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
167  if (str32.length() < 1) {
168  return false;
169  }
170  return AddString32(str32.c_str());
171 }
bool AddString32(const char_32 *char_32_ptr)
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256

◆ AddString32()

bool tesseract::WordListLangModel::AddString32 ( const char_32 char_32_ptr)

Definition at line 174 of file word_list_lang_model.cpp.

174  {
175  if (char_32_ptr == NULL) {
176  return false;
177  }
178  // get all the word variants
179  vector<WERD_CHOICE *> word_variants;
180  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
181  char_32_ptr, &word_variants);
182 
183  if (word_variants.size() > 0) {
184  // find the shortest variant
185  int shortest_word = 0;
186  for (int word = 1; word < word_variants.size(); word++) {
187  if (word_variants[shortest_word]->length() >
188  word_variants[word]->length()) {
189  shortest_word = word;
190  }
191  }
192  // only add the shortest grapheme interpretation of string to the word list
193  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
194  }
195  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
196  return true;
197 }
const UNICHARSET * TessUnicharset() const
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE *> *word_variants)
CharSet * CharacterSet() const

◆ GetEdges()

LangModEdge ** tesseract::WordListLangModel::GetEdges ( CharAltList * alt_list,
LangModEdge * edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 67 of file word_list_lang_model.cpp.

69  {
70  // initialize if necessary
71  if (init_ == false) {
72  if (Init() == false) {
73  return NULL;
74  }
75  }
76 
77  (*edge_cnt) = 0;
78 
79  EDGE_REF edge_ref;
80 
81  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
82 
83  if (tess_lm_edge == NULL) {
84  edge_ref = 0;
85  } else {
86  edge_ref = tess_lm_edge->EndEdge();
87 
88  // advance node
89  edge_ref = dawg_->next_node(edge_ref);
90  if (edge_ref == 0) {
91  return NULL;
92  }
93  }
94 
95  // allocate memory for edges
96  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
97 
98  // now get all the emerging edges
99  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
100  edge_array + (*edge_cnt));
101 
102  return edge_array;
103 }
inT64 EDGE_REF
Definition: dawg.h:54
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)

◆ IsDigit()

bool tesseract::WordListLangModel::IsDigit ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 58 of file word_list_lang_model.h.

58 { return false; } // not yet implemented

◆ IsLeadingPunc()

bool tesseract::WordListLangModel::IsLeadingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 56 of file word_list_lang_model.h.

56 { return false; } // not yet implemented

◆ IsTrailingPunc()

bool tesseract::WordListLangModel::IsTrailingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 57 of file word_list_lang_model.h.

57 { return false; } // not yet implemented

◆ IsValidSequence()

bool tesseract::WordListLangModel::IsValidSequence ( const char_32 * sequence,
bool  eow_flag,
LangModEdge **  edges 
)
virtual

Implements tesseract::LangModel.

Definition at line 107 of file word_list_lang_model.cpp.

108  {
109  return false;
110 }

◆ Root()

LangModEdge * tesseract::WordListLangModel::Root ( )
virtual

Implements tesseract::LangModel.

Definition at line 62 of file word_list_lang_model.cpp.

62  {
63  return NULL;
64 }

◆ WordVariants()

void tesseract::WordListLangModel::WordVariants ( const CharSet & char_set,
const UNICHARSET * uchset,
string_32  str32,
vector< WERD_CHOICE *> *  word_variants 
)
static

Definition at line 147 of file word_list_lang_model.cpp.

149  {
150  for (int i = 0; i < word_variants->size(); i++) {
151  delete (*word_variants)[i];
152  }
153  word_variants->clear();
154  string_32 prefix_str32;
155  WERD_CHOICE word_so_far(uchset);
156  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
157 }
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE *> *word_variants)
basic_string< char_32 > string_32
Definition: string_32.h:41

The documentation for this class was generated from the following files: