tesseract  3.05.02
dawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.c (Formerly dawg.c)
5  * Description: Use a Directed Accyclic Word Graph
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #ifdef _MSC_VER
30 #pragma warning(disable:4244) // Conversion warnings
31 #pragma warning(disable:4800) // int/bool warnings
32 #endif
33 #include "dawg.h"
34 
35 #include "cutil.h"
36 #include "dict.h"
37 #include "emalloc.h"
38 #include "helpers.h"
39 #include "strngs.h"
40 #include "tesscallback.h"
41 #include "tprintf.h"
42 
43 /*----------------------------------------------------------------------
44  F u n c t i o n s f o r D a w g
45 ----------------------------------------------------------------------*/
46 namespace tesseract {
47 
49  bool requires_complete) const {
50  if (word.length() == 0) return !requires_complete;
51  NODE_REF node = 0;
52  int end_index = word.length() - 1;
53  for (int i = 0; i < end_index; i++) {
54  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
55  if (edge == NO_EDGE) {
56  return false;
57  }
58  if ((node = next_node(edge)) == 0) {
59  // This only happens if all words following this edge terminate --
60  // there are no larger words. See Trie::add_word_to_dawg()
61  return false;
62  }
63  }
64  // Now check the last character.
65  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
66  NO_EDGE;
67 }
68 
69 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
70  return prefix_in_dawg(word, true);
71 }
72 
74  const UNICHARSET &unicharset,
75  bool enable_wildcard) const {
76  if (filename == NULL) return 0;
77 
78  FILE *word_file;
79  char string [CHARS_PER_LINE];
80  int misses = 0;
81  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
82 
83  word_file = open_file (filename, "r");
84 
85  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
86  chomp_string(string); // remove newline
87  WERD_CHOICE word(string, unicharset);
88  if (word.length() > 0 &&
89  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
90  if (!match_words(&word, 0, 0,
91  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
92  tprintf("Missing word: %s\n", string);
93  ++misses;
94  }
95  } else {
96  tprintf("Failed to create a valid word from %s\n", string);
97  }
98  }
99  fclose (word_file);
100  // Make sure the user sees this with fprintf instead of tprintf.
101  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
102  return misses;
103 }
104 
105 void Dawg::iterate_words(const UNICHARSET &unicharset,
107  WERD_CHOICE word(&unicharset);
108  iterate_words_rec(word, 0, cb);
109 }
110 
112  STRING s;
113  wc->string_and_lengths(&s, NULL);
114  cb->Run(s.string());
115 }
116 
117 void Dawg::iterate_words(const UNICHARSET &unicharset,
118  TessCallback1<const char *> *cb) const {
121  WERD_CHOICE word(&unicharset);
122  iterate_words_rec(word, 0, shim);
123  delete shim;
124 }
125 
126 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
127  NODE_REF to_explore,
129  NodeChildVector children;
130  this->unichar_ids_of(to_explore, &children, false);
131  for (int i = 0; i < children.size(); i++) {
132  WERD_CHOICE next_word(word_so_far);
133  next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
134  if (this->end_of_word(children[i].edge_ref)) {
135  cb->Run(&next_word);
136  }
137  NODE_REF next = next_node(children[i].edge_ref);
138  if (next != 0) {
139  iterate_words_rec(next_word, next, cb);
140  }
141  }
142 }
143 
145  NODE_REF node, UNICHAR_ID wildcard) const {
146  EDGE_REF edge;
147  inT32 word_end;
148 
149  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
150  bool any_matched = false;
151  NodeChildVector vec;
152  this->unichar_ids_of(node, &vec, false);
153  for (int i = 0; i < vec.size(); ++i) {
154  word->set_unichar_id(vec[i].unichar_id, index);
155  if (match_words(word, index, node, wildcard))
156  any_matched = true;
157  }
158  word->set_unichar_id(wildcard, index);
159  return any_matched;
160  } else {
161  word_end = index == word->length() - 1;
162  edge = edge_char_of(node, word->unichar_id(index), word_end);
163  if (edge != NO_EDGE) { // normal edge in DAWG
164  node = next_node(edge);
165  if (word_end) {
166  if (debug_level_ > 1) word->print("match_words() found: ");
167  return true;
168  } else if (node != 0) {
169  return match_words(word, index+1, node, wildcard);
170  }
171  }
172  }
173  return false;
174 }
175 
176 void Dawg::init(DawgType type, const STRING &lang,
177  PermuterType perm, int unicharset_size, int debug_level) {
178  type_ = type;
179  lang_ = lang;
180  perm_ = perm;
181  ASSERT_HOST(unicharset_size > 0);
182  unicharset_size_ = unicharset_size;
183  // Set bit masks. We will use the value unicharset_size_ as a null char, so
184  // the actual number of unichars is unicharset_size_ + 1.
185  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
187  letter_mask_ = ~(~0ull << flag_start_bit_);
190 
191  debug_level_ = debug_level;
192 }
193 
194 
195 /*----------------------------------------------------------------------
196  F u n c t i o n s f o r S q u i s h e d D a w g
197 ----------------------------------------------------------------------*/
198 
199 SquishedDawg::~SquishedDawg() { delete[] edges_; }
200 
202  UNICHAR_ID unichar_id,
203  bool word_end) const {
204  EDGE_REF edge = node;
205  if (node == 0) { // binary search
206  EDGE_REF start = 0;
207  EDGE_REF end = num_forward_edges_in_node0 - 1;
208  int compare;
209  while (start <= end) {
210  edge = (start + end) >> 1; // (start + end) / 2
211  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
212  unichar_id, edges_[edge]);
213  if (compare == 0) { // given == vec[k]
214  return edge;
215  } else if (compare == 1) { // given > vec[k]
216  start = edge + 1;
217  } else { // given < vec[k]
218  end = edge - 1;
219  }
220  }
221  } else { // linear search
222  if (edge != NO_EDGE && edge_occupied(edge)) {
223  do {
224  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
225  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
226  return (edge);
227  } while (!last_edge(edge++));
228  }
229  }
230  return (NO_EDGE); // not found
231 }
232 
233 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const {
234  EDGE_REF edge = node;
235  inT32 num = 0;
236 
237  if (forward_edge (edge)) {
238  do {
239  num++;
240  } while (!last_edge(edge++));
241  }
242 
243  return (num);
244 }
245 
246 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
247  if (node == NO_EDGE) return; // nothing to print
248 
249  EDGE_REF edge = node;
250  const char *forward_string = "FORWARD";
251  const char *backward_string = " ";
252 
253  const char *last_string = "LAST";
254  const char *not_last_string = " ";
255 
256  const char *eow_string = "EOW";
257  const char *not_eow_string = " ";
258 
259  const char *direction;
260  const char *is_last;
261  const char *eow;
262 
263  UNICHAR_ID unichar_id;
264 
265  if (edge_occupied(edge)) {
266  do {
267  direction =
268  forward_edge(edge) ? forward_string : backward_string;
269  is_last = last_edge(edge) ? last_string : not_last_string;
270  eow = end_of_word(edge) ? eow_string : not_eow_string;
271 
272  unichar_id = edge_letter(edge);
273  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
274  edge, next_node(edge), unichar_id,
275  direction, is_last, eow);
276 
277  if (edge - node > max_num_edges) return;
278  } while (!last_edge(edge++));
279 
280  if (edge < num_edges_ &&
281  edge_occupied(edge) && backward_edge(edge)) {
282  do {
283  direction =
284  forward_edge(edge) ? forward_string : backward_string;
285  is_last = last_edge(edge) ? last_string : not_last_string;
286  eow = end_of_word(edge) ? eow_string : not_eow_string;
287 
288  unichar_id = edge_letter(edge);
289  tprintf(REFFORMAT " : next = " REFFORMAT
290  ", unichar_id = %d, %s %s %s\n",
291  edge, next_node(edge), unichar_id,
292  direction, is_last, eow);
293 
294  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
295  } while (!last_edge(edge++));
296  }
297  }
298  else {
299  tprintf(REFFORMAT " : no edges in this node\n", node);
300  }
301  tprintf("\n");
302 }
303 
304 void SquishedDawg::print_edge(EDGE_REF edge) const {
305  if (edge == NO_EDGE) {
306  tprintf("NO_EDGE\n");
307  } else {
308  tprintf(REFFORMAT " : next = " REFFORMAT
309  ", unichar_id = '%d', %s %s %s\n", edge,
310  next_node(edge), edge_letter(edge),
311  (forward_edge(edge) ? "FORWARD" : " "),
312  (last_edge(edge) ? "LAST" : " "),
313  (end_of_word(edge) ? "EOW" : ""));
314  }
315 }
316 
317 void SquishedDawg::read_squished_dawg(FILE *file,
318  DawgType type,
319  const STRING &lang,
320  PermuterType perm,
321  int debug_level) {
322  if (debug_level) tprintf("Reading squished dawg\n");
323 
324  // Read the magic number and if it does not match kDawgMagicNumber
325  // set swap to true to indicate that we need to switch endianness.
326  inT16 magic;
327  fread(&magic, sizeof(inT16), 1, file);
328  bool swap = (magic != kDawgMagicNumber);
329 
330  int unicharset_size;
331  fread(&unicharset_size, sizeof(inT32), 1, file);
332  fread(&num_edges_, sizeof(inT32), 1, file);
333 
334  if (swap) {
335  ReverseN(&unicharset_size, sizeof(unicharset_size));
336  ReverseN(&num_edges_, sizeof(num_edges_));
337  }
338  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
339  Dawg::init(type, lang, perm, unicharset_size, debug_level);
340 
341  edges_ = new EDGE_RECORD[num_edges_];
342  fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
343  EDGE_REF edge;
344  if (swap) {
345  for (edge = 0; edge < num_edges_; ++edge) {
346  ReverseN(&edges_[edge], sizeof(edges_[edge]));
347  }
348  }
349  if (debug_level > 2) {
350  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
351  type_, lang_.string(), perm_, unicharset_size_, num_edges_);
352  for (edge = 0; edge < num_edges_; ++edge)
353  print_edge(edge);
354  }
355 }
356 
357 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
358  EDGE_REF edge;
359  NODE_MAP node_map;
360  inT32 node_counter;
361  inT32 num_edges;
362 
363  node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
364 
365  for (edge = 0; edge < num_edges_; edge++) // init all slots
366  node_map [edge] = -1;
367 
368  node_counter = num_forward_edges(0);
369 
370  *num_nodes = 0;
371  for (edge = 0; edge < num_edges_; edge++) { // search all slots
372 
373  if (forward_edge(edge)) {
374  (*num_nodes)++; // count nodes links
375  node_map[edge] = (edge ? node_counter : 0);
376  num_edges = num_forward_edges(edge);
377  if (edge != 0) node_counter += num_edges;
378  edge += num_edges;
379  if (edge >= num_edges_) break;
380  if (backward_edge(edge)) while (!last_edge(edge++));
381  edge--;
382  }
383  }
384  return (node_map);
385 }
386 
388  EDGE_REF edge;
389  inT32 num_edges;
390  inT32 node_count = 0;
391  NODE_MAP node_map;
392  EDGE_REF old_index;
393  EDGE_RECORD temp_record;
394 
395  if (debug_level_) tprintf("write_squished_dawg\n");
396 
397  node_map = build_node_map(&node_count);
398 
399  // Write the magic number to help detecting a change in endianness.
400  inT16 magic = kDawgMagicNumber;
401  fwrite(&magic, sizeof(inT16), 1, file);
402  fwrite(&unicharset_size_, sizeof(inT32), 1, file);
403 
404  // Count the number of edges in this Dawg.
405  num_edges = 0;
406  for (edge=0; edge < num_edges_; edge++)
407  if (forward_edge(edge))
408  num_edges++;
409 
410  fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file
411 
412  if (debug_level_) {
413  tprintf("%d nodes in DAWG\n", node_count);
414  tprintf("%d edges in DAWG\n", num_edges);
415  }
416 
417  for (edge = 0; edge < num_edges_; edge++) {
418  if (forward_edge(edge)) { // write forward edges
419  do {
420  old_index = next_node_from_edge_rec(edges_[edge]);
421  set_next_node(edge, node_map[old_index]);
422  temp_record = edges_[edge];
423  fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
424  set_next_node(edge, old_index);
425  } while (!last_edge(edge++));
426 
427  if (edge >= num_edges_) break;
428  if (backward_edge(edge)) // skip back links
429  while (!last_edge(edge++));
430 
431  edge--;
432  }
433  }
434  free(node_map);
435 }
436 
437 } // namespace tesseract
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:48
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
uinT64 letter_mask_
Definition: dawg.h:309
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:201
uinT64 flags_mask_
Definition: dawg.h:308
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
virtual void Run(A1)=0
inT64 EDGE_REF
Definition: dawg.h:54
short inT16
Definition: host.h:33
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:458
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
inT64 NODE_REF
Definition: dawg.h:55
virtual bool end_of_word(EDGE_REF edge_ref) const =0
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
uinT64 next_node_mask_
Definition: dawg.h:307
void print() const
Definition: ratngs.h:564
PermuterType
Definition: ratngs.h:240
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
int next_node_start_bit_
Definition: dawg.h:306
DawgType
Definition: dawg.h:71
void write_squished_dawg(FILE *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:387
void print_node(NODE_REF node, int max_num_edges) const
Definition: dawg.cpp:246
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:304
static const inT16 kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:121
void CallWithUTF8(TessCallback1< const char *> *cb, const WERD_CHOICE *wc)
Definition: dawg.cpp:111
EDGE_REF * NODE_MAP
Definition: dawg.h:56
bool match_words(WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:144
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:245
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:299
STRING lang_
Definition: dawg.h:297
DawgType type() const
Definition: dawg.h:127
int unicharset_size_
Definition: dawg.h:304
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:469
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:73
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
const char * string() const
Definition: strngs.cpp:201
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
#define NUM_FLAG_BITS
Definition: dawg.h:91
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
Definition: dawg.cpp:105
DawgType type_
Definition: dawg.h:296
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int length() const
Definition: ratngs.h:301
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
int debug_level_
Definition: dawg.h:311
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int inT32
Definition: host.h:35
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
#define REFFORMAT
Definition: dawg.h:92
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:464
#define CHARS_PER_LINE
Definition: cutil.h:57
int flag_start_bit_
Definition: dawg.h:305
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
Definition: dawg.cpp:126
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
void init(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: dawg.cpp:176
const STRING & lang() const
Definition: dawg.h:128
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
void chomp_string(char *str)
Definition: helpers.h:75
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:50
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int UNICHAR_ID
Definition: unichar.h:33