tesseract  3.05.02
word_list_lang_model.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_list_lang_model.cpp
3  * Description: Implementation of the Word List Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 #include <vector>
22 #include "word_list_lang_model.h"
23 #include "cube_utils.h"
24 
25 #include "ratngs.h"
26 #include "trie.h"
27 
28 namespace tesseract {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }
34 
// NOTE(review): the definition header that opens this body (listing line 35)
// is missing from this extract. The body only calls Cleanup(), so this is
// presumably the WordListLangModel destructor -- confirm against the
// original file before relying on it.
36  Cleanup();
37 }
38 
39 // Cleanup
40 void WordListLangModel::Cleanup() {
41  if (dawg_ != NULL) {
42  delete dawg_;
43  dawg_ = NULL;
44  }
45  init_ = false;
46 }
47 
48 // Initialize the language model
49 bool WordListLangModel::Init() {
50  if (init_ == true) {
51  return true;
52  }
53  // The last parameter to the Trie constructor (the debug level) is set to
54  // false for now, until Cube has a way to express its preferred debug level.
55  dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
56  cntxt_->CharacterSet()->ClassCount(), false);
57  init_ = true;
58  return true;
59 }
60 
61 // return a pointer to the root
// NOTE(review): the definition header that opens this body (listing line 62)
// is missing from this extract, so the function name and return type cannot
// be confirmed from here. As written, the body unconditionally returns NULL.
63  return NULL;
64 }
65 
66 // return the edges emerging from the current state
68  LangModEdge *edge,
69  int *edge_cnt) {
70  // initialize if necessary
71  if (init_ == false) {
72  if (Init() == false) {
73  return NULL;
74  }
75  }
76 
77  (*edge_cnt) = 0;
78 
79  EDGE_REF edge_ref;
80 
81  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
82 
83  if (tess_lm_edge == NULL) {
84  edge_ref = 0;
85  } else {
86  edge_ref = tess_lm_edge->EndEdge();
87 
88  // advance node
89  edge_ref = dawg_->next_node(edge_ref);
90  if (edge_ref == 0) {
91  return NULL;
92  }
93  }
94 
95  // allocate memory for edges
96  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
97 
98  // now get all the emerging edges
99  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
100  edge_array + (*edge_cnt));
101 
102  return edge_array;
103 }
104 
105 // returns true if the char_32 is supported by the language model
106 // TODO(ahmadab) currently not implemented
108  bool terminal, LangModEdge **edges) {
109  return false;
110 }
111 
112 // Recursive helper function for WordVariants().
113 void WordListLangModel::WordVariants(const CharSet &char_set,
114  string_32 prefix_str32,
115  WERD_CHOICE *word_so_far,
116  string_32 str32,
117  vector<WERD_CHOICE *> *word_variants) {
118  int str_len = str32.length();
119  if (str_len == 0) {
120  if (word_so_far->length() > 0) {
121  word_variants->push_back(new WERD_CHOICE(*word_so_far));
122  }
123  } else {
124  // Try out all the possible prefixes of the str32.
125  for (int len = 1; len <= str_len; len++) {
126  // Check if prefix is supported in character set.
127  string_32 str_pref32 = str32.substr(0, len);
128  int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
129  str_pref32.c_str()));
130  if (class_id <= 0) {
131  continue;
132  } else {
133  string_32 new_prefix_str32 = prefix_str32 + str_pref32;
134  string_32 new_str32 = str32.substr(len);
135  word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
136  WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
137  word_variants);
138  word_so_far->remove_last_unichar_id();
139  }
140  }
141  }
142 }
143 
144 // Compute all the variants of a 32-bit string in terms of the class-ids
145 // This is needed for languages that have ligatures. A word can then have more
146 // than one spelling in terms of the class-ids
148  const UNICHARSET *uchset, string_32 str32,
149  vector<WERD_CHOICE *> *word_variants) {
150  for (int i = 0; i < word_variants->size(); i++) {
151  delete (*word_variants)[i];
152  }
153  word_variants->clear();
154  string_32 prefix_str32;
155  WERD_CHOICE word_so_far(uchset);
156  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
157 }
158 
159 // add a new UTF-8 string to the lang model
160 bool WordListLangModel::AddString(const char *char_ptr) {
161  if (!init_ && !Init()) { // initialize if necessary
162  return false;
163  }
164 
165  string_32 str32;
166  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
167  if (str32.length() < 1) {
168  return false;
169  }
170  return AddString32(str32.c_str());
171 }
172 
173 // add a new UTF-32 string to the lang model
174 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
175  if (char_32_ptr == NULL) {
176  return false;
177  }
178  // get all the word variants
179  vector<WERD_CHOICE *> word_variants;
180  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
181  char_32_ptr, &word_variants);
182 
183  if (word_variants.size() > 0) {
184  // find the shortest variant
185  int shortest_word = 0;
186  for (int word = 1; word < word_variants.size(); word++) {
187  if (word_variants[shortest_word]->length() >
188  word_variants[word]->length()) {
189  shortest_word = word;
190  }
191  }
192  // only add the shortest grapheme interpretation of string to the word list
193  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
194  }
195  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
196  return true;
197 }
198 
199 }
inT64 EDGE_REF
Definition: dawg.h:54
const UNICHARSET * TessUnicharset() const
bool AddString32(const char_32 *char_32_ptr)
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE *> *word_variants)
void remove_last_unichar_id()
Definition: ratngs.h:481
int ClassCount() const
Definition: char_set.h:111
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
int length() const
Definition: ratngs.h:301
WordListLangModel(CubeRecoContext *cntxt)
CharSet * CharacterSet() const
bool AddString(const char *char_ptr)
signed int char_32
Definition: string_32.h:40
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256
int ClassID(const char_32 *str) const
Definition: char_set.h:54
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **edges)
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446