tesseract  3.05.02
shapetable.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.h
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Thu Oct 28 17:46:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
24 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
25 
26 #include "bitvector.h"
27 #include "fontinfo.h"
28 #include "genericheap.h"
29 #include "genericvector.h"
30 #include "intmatcher.h"
31 
32 class STRING;
33 class UNICHARSET;
34 
35 namespace tesseract {
36 
37 class ShapeTable;
38 
39 // Simple struct to hold a single classifier unichar selection, a corresponding
40 // rating, and a list of appropriate fonts.
41 struct UnicharRating {
43  : unichar_id(0), rating(0.0f), adapted(false), config(0),
44  feature_misses(0) {}
45  UnicharRating(int u, float r)
46  : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
47 
48  // Print debug info.
49  void Print() const {
50  tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%d,"
51  " %d fonts\n", unichar_id, rating, adapted, config, feature_misses,
52  fonts.size());
53  }
54 
55  // Sort function to sort ratings appropriately by descending rating.
56  static int SortDescendingRating(const void* t1, const void* t2) {
57  const UnicharRating* a = reinterpret_cast<const UnicharRating *>(t1);
58  const UnicharRating* b = reinterpret_cast<const UnicharRating *>(t2);
59  if (a->rating > b->rating) {
60  return -1;
61  } else if (a->rating < b->rating) {
62  return 1;
63  } else {
64  return a->unichar_id - b->unichar_id;
65  }
66  }
67  // Helper function to get the index of the first result with the required
68  // unichar_id. If the results are sorted by rating, this will also be the
69  // best result with the required unichar_id.
70  // Returns -1 if the unichar_id is not found
71  static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results,
73 
74  // Index into some UNICHARSET table indicates the class of the answer.
76  // Rating from classifier with 1.0 perfect and 0.0 impossible.
77  // Call it a probability if you must.
78  float rating;
79  // True if this result is from the adaptive classifier.
80  bool adapted;
81  // Index of best matching font configuration of result.
83  // Number of features that were total misses - were liked by no classes.
85  // Unsorted collection of fontinfo ids and scores. Note that a raw result
86  // from the IntegerMatch will contain config ids, that require transforming
87  // to fontinfo ids via fontsets and (possibly) shapetable.
89 };
90 
91 // Classifier result from a low-level classification is an index into some
92 // ShapeTable and a rating.
93 struct ShapeRating {
95  : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
96  joined(false), broken(false) {}
97  ShapeRating(int s, float r)
98  : shape_id(s), rating(r), raw(1.0f), font(0.0f),
99  joined(false), broken(false) {}
100 
101  // Sort function to sort ratings appropriately by descending rating.
102  static int SortDescendingRating(const void* t1, const void* t2) {
103  const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1);
104  const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2);
105  if (a->rating > b->rating) {
106  return -1;
107  } else if (a->rating < b->rating) {
108  return 1;
109  } else {
110  return a->shape_id - b->shape_id;
111  }
112  }
113  // Helper function to get the index of the first result with the required
114  // unichar_id. If the results are sorted by rating, this will also be the
115  // best result with the required unichar_id.
116  // Returns -1 if the unichar_id is not found
117  static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
118  const ShapeTable& shape_table,
119  UNICHAR_ID unichar_id);
120 
121  // Index into some shape table indicates the class of the answer.
122  int shape_id;
123  // Rating from classifier with 1.0 perfect and 0.0 impossible.
124  // Call it a probability if you must.
125  float rating;
126  // Subsidiary rating that a classifier may use internally.
127  float raw;
128  // Subsidiary rating that a classifier may use internally.
129  float font;
130  // Flag indicating that the input may be joined.
131  bool joined;
132  // Flag indicating that the input may be broken (a fragment).
133  bool broken;
134 };
135 
136 // Simple struct to hold an entry for a heap-based priority queue of
137 // ShapeRating.
140  ShapeQueueEntry(const ShapeRating& rating, int level0)
141  : result(rating), level(level0) {}
142 
143  // Sort by decreasing rating and decreasing level for equal rating.
144  bool operator<(const ShapeQueueEntry& other) const {
145  if (result.rating > other.result.rating) return true;
146  if (result.rating == other.result.rating)
147  return level > other.level;
148  return false;
149  }
150 
151  // Output from classifier.
153  // Which level in the tree did this come from?
154  int level;
155 };
157 
158 // Simple struct to hold a set of fonts associated with a single unichar-id.
159 // A vector of UnicharAndFonts makes a shape.
162  }
163  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
164  font_ids.push_back(font_id);
165  }
166 
167  // Writes to the given file. Returns false in case of error.
168  bool Serialize(FILE* fp) const;
169  // Reads from the given file. Returns false in case of error.
170  // If swap is true, assumes a big/little-endian swap is needed.
171  bool DeSerialize(bool swap, FILE* fp);
172 
173  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
174  static int SortByUnicharId(const void* v1, const void* v2);
175 
178 };
179 
180 // A Shape is a collection of unichar-ids and a list of fonts associated with
181 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
182 // a classifiable unit, and represents a group of characters or parts of
183 // characters that have a similar or identical shape. Shapes/ShapeTables may
184 // be organized hierarchically from identical shapes at the leaves to vaguely
185 // similar shapes near the root.
186 class Shape {
187  public:
188  Shape() : destination_index_(-1) {}
189 
190  // Writes to the given file. Returns false in case of error.
191  bool Serialize(FILE* fp) const;
192  // Reads from the given file. Returns false in case of error.
193  // If swap is true, assumes a big/little-endian swap is needed.
194  bool DeSerialize(bool swap, FILE* fp);
195 
196  int destination_index() const {
197  return destination_index_;
198  }
199  void set_destination_index(int index) {
200  destination_index_ = index;
201  }
202  int size() const {
203  return unichars_.size();
204  }
205  // Returns a UnicharAndFonts entry for the given index, which must be
206  // in the range [0, size()).
207  const UnicharAndFonts& operator[](int index) const {
208  return unichars_[index];
209  }
210  // Sets the unichar_id of the given index to the new unichar_id.
211  void SetUnicharId(int index, int unichar_id) {
212  unichars_[index].unichar_id = unichar_id;
213  }
214  // Adds a font_id for the given unichar_id. If the unichar_id is not
215  // in the shape, it is added.
216  void AddToShape(int unichar_id, int font_id);
217  // Adds everything in other to this.
218  void AddShape(const Shape& other);
219  // Returns true if the shape contains the given unichar_id, font_id pair.
220  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
221  // Returns true if the shape contains the given unichar_id, ignoring font.
222  bool ContainsUnichar(int unichar_id) const;
223  // Returns true if the shape contains the given font, ignoring unichar_id.
224  bool ContainsFont(int font_id) const;
225  // Returns true if the shape contains the given font properties, ignoring
226  // unichar_id.
227  bool ContainsFontProperties(const FontInfoTable& font_table,
228  uinT32 properties) const;
229  // Returns true if the shape contains multiple different font properties,
230  // ignoring unichar_id.
231  bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
232  // Returns true if this shape is equal to other (ignoring order of unichars
233  // and fonts).
234  bool operator==(const Shape& other) const;
235  // Returns true if this is a subset (including equal) of other.
236  bool IsSubsetOf(const Shape& other) const;
237  // Returns true if the lists of unichar ids are the same in this and other,
238  // ignoring fonts.
239  // NOT const, as it will sort the unichars on demand.
240  bool IsEqualUnichars(Shape* other);
241 
242  private:
243  // Sorts the unichars_ vector by unichar.
244  void SortUnichars();
245 
246  // Flag indicates that the unichars are sorted, allowing faster set
247  // operations with another shape.
248  bool unichars_sorted_;
249  // If this Shape is part of a ShapeTable the destiation_index_ is the index
250  // of some other shape in the ShapeTable with which this shape is merged.
251  int destination_index_;
252  // Array of unichars, each with a set of fonts. Each unichar has at most
253  // one entry in the vector.
255 };
256 
257 // ShapeTable is a class to encapsulate the triple indirection that is
258 // used here.
259 // ShapeTable is a vector of shapes.
260 // Each shape is a vector of UnicharAndFonts representing the set of unichars
261 // that the shape represents.
262 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
263 // mapped to the shape during training.
265  public:
266  ShapeTable();
267  // The UNICHARSET reference supplied here, or in set_unicharset below must
268  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
269  explicit ShapeTable(const UNICHARSET& unicharset);
270 
271  // Writes to the given file. Returns false in case of error.
272  bool Serialize(FILE* fp) const;
273  // Reads from the given file. Returns false in case of error.
274  // If swap is true, assumes a big/little-endian swap is needed.
275  bool DeSerialize(bool swap, FILE* fp);
276 
277  // Accessors.
278  int NumShapes() const {
279  return shape_table_.size();
280  }
281  const UNICHARSET& unicharset() const {
282  return *unicharset_;
283  }
284  // Returns the number of fonts used in this ShapeTable, computing it if
285  // necessary.
286  int NumFonts() const;
287  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
288  // entire life of the ShapeTable.
289  void set_unicharset(const UNICHARSET& unicharset) {
290  unicharset_ = &unicharset;
291  }
292  // Re-indexes the class_ids in the shapetable according to the given map.
293  // Useful in conjunction with set_unicharset.
294  void ReMapClassIds(const GenericVector<int>& unicharset_map);
295  // Returns a string listing the classes/fonts in a shape.
296  STRING DebugStr(int shape_id) const;
297  // Returns a debug string summarizing the table.
298  STRING SummaryStr() const;
299 
300  // Adds a new shape starting with the given unichar_id and font_id.
301  // Returns the assigned index.
302  int AddShape(int unichar_id, int font_id);
303  // Adds a copy of the given shape unless it is already present.
304  // Returns the assigned index or index of existing shape if already present.
305  int AddShape(const Shape& other);
306  // Removes the shape given by the shape index. All indices above are changed!
307  void DeleteShape(int shape_id);
308  // Adds a font_id to the given existing shape index for the given
309  // unichar_id. If the unichar_id is not in the shape, it is added.
310  void AddToShape(int shape_id, int unichar_id, int font_id);
311  // Adds the given shape to the existing shape with the given index.
312  void AddShapeToShape(int shape_id, const Shape& other);
313  // Returns the id of the shape that contains the given unichar and font.
314  // If not found, returns -1.
315  // If font_id < 0, the font_id is ignored and the first shape that matches
316  // the unichar_id is returned.
317  int FindShape(int unichar_id, int font_id) const;
318  // Returns the first unichar_id and font_id in the given shape.
319  void GetFirstUnicharAndFont(int shape_id,
320  int* unichar_id, int* font_id) const;
321 
322  // Accessors for the Shape with the given shape_id.
323  const Shape& GetShape(int shape_id) const {
324  return *shape_table_[shape_id];
325  }
326  Shape* MutableShape(int shape_id) {
327  return shape_table_[shape_id];
328  }
329 
330  // Expands all the classes/fonts in the shape individually to build
331  // a ShapeTable.
332  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
333 
334  // Returns true if the shapes are already merged.
335  bool AlreadyMerged(int shape_id1, int shape_id2) const;
336  // Returns true if any shape contains multiple unichars.
337  bool AnyMultipleUnichars() const;
338  // Returns the maximum number of unichars over all shapes.
339  int MaxNumUnichars() const;
340  // Merges shapes with a common unichar over the [start, end) interval.
341  // Assumes single unichar per shape.
342  void ForceFontMerges(int start, int end);
343  // Returns the number of unichars in the master shape.
344  int MasterUnicharCount(int shape_id) const;
345  // Returns the sum of the font counts in the master shape.
346  int MasterFontCount(int shape_id) const;
347  // Returns the number of unichars that would result from merging the shapes.
348  int MergedUnicharCount(int shape_id1, int shape_id2) const;
349  // Merges two shape_ids, leaving shape_id2 marked as merged.
350  void MergeShapes(int shape_id1, int shape_id2);
351  // Swaps two shape_ids.
352  void SwapShapes(int shape_id1, int shape_id2);
353  // Appends the master shapes from other to this.
354  // Used to create a clean ShapeTable from a merged one, or to create a
355  // copy of a ShapeTable.
356  // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
357  void AppendMasterShapes(const ShapeTable& other,
358  GenericVector<int>* shape_map);
359  // Returns the number of master shapes remaining after merging.
360  int NumMasterShapes() const;
361  // Returns the destination of this shape, (if merged), taking into account
362  // the fact that the destination may itself have been merged.
363  // For a non-merged shape, returns the input shape_id.
364  int MasterDestinationIndex(int shape_id) const;
365 
366  // Returns false if the unichars in neither shape is a subset of the other..
367  bool SubsetUnichar(int shape_id1, int shape_id2) const;
368  // Returns false if the unichars in neither shape is a subset of the other..
369  bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
370  // Returns true if the unichar sets are equal between the shapes.
371  bool EqualUnichars(int shape_id1, int shape_id2) const;
372  bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
373  // Returns true if there is a common unichar between the shapes.
374  bool CommonUnichars(int shape_id1, int shape_id2) const;
375  // Returns true if there is a common font id between the shapes.
376  bool CommonFont(int shape_id1, int shape_id2) const;
377 
378  // Adds the unichars of the given shape_id to the vector of results. Any
379  // unichar_id that is already present just has the fonts added to the
380  // font set for that result without adding a new entry in the vector.
381  // NOTE: it is assumed that the results are given to this function in order
382  // of decreasing rating.
383  // The unichar_map vector indicates the index of the results entry containing
384  // each unichar, or -1 if the unichar is not yet included in results.
385  void AddShapeToResults(const ShapeRating& shape_rating,
386  GenericVector<int>* unichar_map,
387  GenericVector<UnicharRating>* results) const;
388 
389  private:
390  // Adds the given unichar_id to the results if needed, updating unichar_map
391  // and returning the index of unichar in results.
392  int AddUnicharToResults(int unichar_id, float rating,
393  GenericVector<int>* unichar_map,
394  GenericVector<UnicharRating>* results) const;
395 
396  // Pointer to a provided unicharset used only by the Debugstr member.
397  const UNICHARSET* unicharset_;
398  // Vector of pointers to the Shapes in this ShapeTable.
399  PointerVector<Shape> shape_table_;
400 
401  // Cached data calculated on demand.
402  mutable int num_fonts_;
403 };
404 
405 } // namespace tesseract.
406 
407 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
static int FirstResultWithUnichar(const GenericVector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
static int FirstResultWithUnichar(const GenericVector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:56
int NumShapes() const
Definition: shapetable.h:278
bool operator==(const Shape &other) const
Definition: shapetable.cpp:206
bool operator<(const ShapeQueueEntry &other) const
Definition: shapetable.h:144
UnicharRating(int u, float r)
Definition: shapetable.h:45
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:166
bool ContainsFontProperties(const FontInfoTable &font_table, uinT32 properties) const
Definition: shapetable.cpp:178
void AddShape(const Shape &other)
Definition: shapetable.cpp:129
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:74
#define TESS_API
Definition: platform.h:81
const UnicharAndFonts & operator[](int index) const
Definition: shapetable.h:207
GenericVector< inT32 > font_ids
Definition: shapetable.h:176
GenericVector< ScoredFont > fonts
Definition: shapetable.h:88
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:156
unsigned char uinT8
Definition: host.h:32
GenericHeap< ShapeQueueEntry > ShapeQueue
Definition: shapetable.h:156
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:67
ShapeRating(int s, float r)
Definition: shapetable.h:97
const UNICHARSET & unicharset() const
Definition: shapetable.h:281
int push_back(T object)
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:102
void set_destination_index(int index)
Definition: shapetable.h:199
unsigned short uinT16
Definition: host.h:34
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:226
ShapeQueueEntry(const ShapeRating &rating, int level0)
Definition: shapetable.h:140
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:90
void set_unicharset(const UNICHARSET &unicharset)
Definition: shapetable.h:289
Shape * MutableShape(int shape_id)
Definition: shapetable.h:326
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:99
Definition: strngs.h:44
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:211
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:191
int size() const
Definition: shapetable.h:202
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:211
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:110
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:323
unsigned int uinT32
Definition: host.h:36
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:140
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
UnicharAndFonts(int uni_id, int font_id)
Definition: shapetable.h:163
int destination_index() const
Definition: shapetable.h:196
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:83
int UNICHAR_ID
Definition: unichar.h:33