tesseract  3.05.02
shapetable.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.cpp
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Tue Nov 02 15:31:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #include "shapetable.h"
24 
25 #include "bitvector.h"
26 #include "fontinfo.h"
27 #include "intfeaturespace.h"
28 #include "strngs.h"
29 #include "unicharset.h"
30 #include "unicity_table.h"
31 
32 namespace tesseract {
33 
34 // Helper function to get the index of the first result with the required
35 // unichar_id. If the results are sorted by rating, this will also be the
36 // best result with the required unichar_id.
37 // Returns -1 if the unichar_id is not found
39  const GenericVector<ShapeRating>& results,
40  const ShapeTable& shape_table,
41  UNICHAR_ID unichar_id) {
42  for (int r = 0; r < results.size(); ++r) {
43  int shape_id = results[r].shape_id;
44  const Shape& shape = shape_table.GetShape(shape_id);
45  if (shape.ContainsUnichar(unichar_id)) {
46  return r;
47  }
48  }
49  return -1;
50 }
51 
52 // Helper function to get the index of the first result with the required
53 // unichar_id. If the results are sorted by rating, this will also be the
54 // best result with the required unichar_id.
55 // Returns -1 if the unichar_id is not found
57  const GenericVector<UnicharRating>& results,
58  UNICHAR_ID unichar_id) {
59  for (int r = 0; r < results.size(); ++r) {
60  if (results[r].unichar_id == unichar_id)
61  return r;
62  }
63  return -1;
64 }
65 
66 // Writes to the given file. Returns false in case of error.
67 bool UnicharAndFonts::Serialize(FILE* fp) const {
68  if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
69  if (!font_ids.Serialize(fp)) return false;
70  return true;
71 }
72 // Reads from the given file. Returns false in case of error.
73 // If swap is true, assumes a big/little-endian swap is needed.
74 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
75  if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
76  if (swap)
77  ReverseN(&unichar_id, sizeof(unichar_id));
78  if (!font_ids.DeSerialize(swap, fp)) return false;
79  return true;
80 }
81 
82 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
83 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
84  const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
85  const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
86  return p1->unichar_id - p2->unichar_id;
87 }
88 
89 // Writes to the given file. Returns false in case of error.
90 bool Shape::Serialize(FILE* fp) const {
91  uinT8 sorted = unichars_sorted_;
92  if (fwrite(&sorted, sizeof(sorted), 1, fp) != 1)
93  return false;
94  if (!unichars_.SerializeClasses(fp)) return false;
95  return true;
96 }
97 // Reads from the given file. Returns false in case of error.
98 // If swap is true, assumes a big/little-endian swap is needed.
99 bool Shape::DeSerialize(bool swap, FILE* fp) {
100  uinT8 sorted;
101  if (fread(&sorted, sizeof(sorted), 1, fp) != 1)
102  return false;
103  unichars_sorted_ = sorted != 0;
104  if (!unichars_.DeSerializeClasses(swap, fp)) return false;
105  return true;
106 }
107 
108 // Adds a font_id for the given unichar_id. If the unichar_id is not
109 // in the shape, it is added.
110 void Shape::AddToShape(int unichar_id, int font_id) {
111  for (int c = 0; c < unichars_.size(); ++c) {
112  if (unichars_[c].unichar_id == unichar_id) {
113  // Found the unichar in the shape table.
114  GenericVector<int>& font_list = unichars_[c].font_ids;
115  for (int f = 0; f < font_list.size(); ++f) {
116  if (font_list[f] == font_id)
117  return; // Font is already there.
118  }
119  font_list.push_back(font_id);
120  return;
121  }
122  }
123  // Unichar_id is not in shape, so add it to shape.
124  unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
125  unichars_sorted_ = unichars_.size() <= 1;
126 }
127 
128 // Adds everything in other to this.
129 void Shape::AddShape(const Shape& other) {
130  for (int c = 0; c < other.unichars_.size(); ++c) {
131  for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
132  AddToShape(other.unichars_[c].unichar_id,
133  other.unichars_[c].font_ids[f]);
134  }
135  }
136  unichars_sorted_ = unichars_.size() <= 1;
137 }
138 
139 // Returns true if the shape contains the given unichar_id, font_id pair.
140 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
141  for (int c = 0; c < unichars_.size(); ++c) {
142  if (unichars_[c].unichar_id == unichar_id) {
143  // Found the unichar, so look for the font.
144  GenericVector<int>& font_list = unichars_[c].font_ids;
145  for (int f = 0; f < font_list.size(); ++f) {
146  if (font_list[f] == font_id)
147  return true;
148  }
149  return false;
150  }
151  }
152  return false;
153 }
154 
155 // Returns true if the shape contains the given unichar_id, ignoring font.
156 bool Shape::ContainsUnichar(int unichar_id) const {
157  for (int c = 0; c < unichars_.size(); ++c) {
158  if (unichars_[c].unichar_id == unichar_id) {
159  return true;
160  }
161  }
162  return false;
163 }
164 
165 // Returns true if the shape contains the given font, ignoring unichar_id.
166 bool Shape::ContainsFont(int font_id) const {
167  for (int c = 0; c < unichars_.size(); ++c) {
168  GenericVector<int>& font_list = unichars_[c].font_ids;
169  for (int f = 0; f < font_list.size(); ++f) {
170  if (font_list[f] == font_id)
171  return true;
172  }
173  }
174  return false;
175 }
176 // Returns true if the shape contains the given font properties, ignoring
177 // unichar_id.
179  uinT32 properties) const {
180  for (int c = 0; c < unichars_.size(); ++c) {
181  GenericVector<int>& font_list = unichars_[c].font_ids;
182  for (int f = 0; f < font_list.size(); ++f) {
183  if (font_table.get(font_list[f]).properties == properties)
184  return true;
185  }
186  }
187  return false;
188 }
189 // Returns true if the shape contains multiple different font properties,
190 // ignoring unichar_id.
192  const FontInfoTable& font_table) const {
193  uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties;
194  for (int c = 0; c < unichars_.size(); ++c) {
195  GenericVector<int>& font_list = unichars_[c].font_ids;
196  for (int f = 0; f < font_list.size(); ++f) {
197  if (font_table.get(font_list[f]).properties != properties)
198  return true;
199  }
200  }
201  return false;
202 }
203 
204 // Returns true if this shape is equal to other (ignoring order of unichars
205 // and fonts).
206 bool Shape::operator==(const Shape& other) const {
207  return IsSubsetOf(other) && other.IsSubsetOf(*this);
208 }
209 
210 // Returns true if this is a subset (including equal) of other.
211 bool Shape::IsSubsetOf(const Shape& other) const {
212  for (int c = 0; c < unichars_.size(); ++c) {
213  int unichar_id = unichars_[c].unichar_id;
214  const GenericVector<int>& font_list = unichars_[c].font_ids;
215  for (int f = 0; f < font_list.size(); ++f) {
216  if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
217  return false;
218  }
219  }
220  return true;
221 }
222 
223 // Returns true if the lists of unichar ids are the same in this and other,
224 // ignoring fonts.
225 // NOT const, as it will sort the unichars on demand.
227  if (unichars_.size() != other->unichars_.size()) return false;
228  if (!unichars_sorted_) SortUnichars();
229  if (!other->unichars_sorted_) other->SortUnichars();
230  for (int c = 0; c < unichars_.size(); ++c) {
231  if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
232  return false;
233  }
234  return true;
235 }
236 
237 // Sorts the unichars_ vector by unichar.
238 void Shape::SortUnichars() {
239  unichars_.sort(UnicharAndFonts::SortByUnicharId);
240  unichars_sorted_ = true;
241 }
242 
243 ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) {
244 }
246  : unicharset_(&unicharset), num_fonts_(0) {
247 }
248 
249 // Writes to the given file. Returns false in case of error.
250 bool ShapeTable::Serialize(FILE* fp) const {
251  if (!shape_table_.Serialize(fp)) return false;
252  return true;
253 }
254 // Reads from the given file. Returns false in case of error.
255 // If swap is true, assumes a big/little-endian swap is needed.
256 bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
257  if (!shape_table_.DeSerialize(swap, fp)) return false;
258  num_fonts_ = 0;
259  return true;
260 }
261 
262 // Returns the number of fonts used in this ShapeTable, computing it if
263 // necessary.
264 int ShapeTable::NumFonts() const {
265  if (num_fonts_ <= 0) {
266  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
267  const Shape& shape = *shape_table_[shape_id];
268  for (int c = 0; c < shape.size(); ++c) {
269  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
270  if (shape[c].font_ids[f] >= num_fonts_)
271  num_fonts_ = shape[c].font_ids[f] + 1;
272  }
273  }
274  }
275  }
276  return num_fonts_;
277 }
278 
279 // Re-indexes the class_ids in the shapetable according to the given map.
280 // Useful in conjunction with set_unicharset.
281 void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
282  for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
283  Shape* shape = shape_table_[shape_id];
284  for (int c = 0; c < shape->size(); ++c) {
285  shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
286  }
287  }
288 }
289 
290 // Returns a string listing the classes/fonts in a shape.
291 STRING ShapeTable::DebugStr(int shape_id) const {
292  if (shape_id < 0 || shape_id >= shape_table_.size())
293  return STRING("INVALID_UNICHAR_ID");
294  const Shape& shape = GetShape(shape_id);
295  STRING result;
296  result.add_str_int("Shape", shape_id);
297  if (shape.size() > 100) {
298  result.add_str_int(" Num unichars=", shape.size());
299  return result;
300  }
301  for (int c = 0; c < shape.size(); ++c) {
302  result.add_str_int(" c_id=", shape[c].unichar_id);
303  result += "=";
304  result += unicharset_->id_to_unichar(shape[c].unichar_id);
305  if (shape.size() < 10) {
306  result.add_str_int(", ", shape[c].font_ids.size());
307  result += " fonts =";
308  int num_fonts = shape[c].font_ids.size();
309  if (num_fonts > 10) {
310  result.add_str_int(" ", shape[c].font_ids[0]);
311  result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
312  } else {
313  for (int f = 0; f < num_fonts; ++f) {
314  result.add_str_int(" ", shape[c].font_ids[f]);
315  }
316  }
317  }
318  }
319  return result;
320 }
321 
322 // Returns a debug string summarizing the table.
324  int max_unichars = 0;
325  int num_multi_shapes = 0;
326  int num_master_shapes = 0;
327  for (int s = 0; s < shape_table_.size(); ++s) {
328  if (MasterDestinationIndex(s) != s) continue;
329  ++num_master_shapes;
330  int shape_size = GetShape(s).size();
331  if (shape_size > 1)
332  ++num_multi_shapes;
333  if (shape_size > max_unichars)
334  max_unichars = shape_size;
335  }
336  STRING result;
337  result.add_str_int("Number of shapes = ", num_master_shapes);
338  result.add_str_int(" max unichars = ", max_unichars);
339  result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
340  return result;
341 }
342 
343 
344 // Adds a new shape starting with the given unichar_id and font_id.
345 // Returns the assigned index.
346 int ShapeTable::AddShape(int unichar_id, int font_id) {
347  int index = shape_table_.size();
348  Shape* shape = new Shape;
349  shape->AddToShape(unichar_id, font_id);
350  shape_table_.push_back(shape);
351  num_fonts_ = MAX(num_fonts_, font_id + 1);
352  return index;
353 }
354 
355 // Adds a copy of the given shape unless it is already present.
356 // Returns the assigned index or index of existing shape if already present.
357 int ShapeTable::AddShape(const Shape& other) {
358  int index;
359  for (index = 0; index < shape_table_.size() &&
360  !(other == *shape_table_[index]); ++index)
361  continue;
362  if (index == shape_table_.size()) {
363  Shape* shape = new Shape(other);
364  shape_table_.push_back(shape);
365  }
366  num_fonts_ = 0;
367  return index;
368 }
369 
370 // Removes the shape given by the shape index.
371 void ShapeTable::DeleteShape(int shape_id) {
372  delete shape_table_[shape_id];
373  shape_table_[shape_id] = NULL;
374  shape_table_.remove(shape_id);
375 }
376 
377 // Adds a font_id to the given existing shape index for the given
378 // unichar_id. If the unichar_id is not in the shape, it is added.
379 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
380  Shape& shape = *shape_table_[shape_id];
381  shape.AddToShape(unichar_id, font_id);
382  num_fonts_ = MAX(num_fonts_, font_id + 1);
383 }
384 
385 // Adds the given shape to the existing shape with the given index.
386 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
387  Shape& shape = *shape_table_[shape_id];
388  shape.AddShape(other);
389  num_fonts_ = 0;
390 }
391 
392 // Returns the id of the shape that contains the given unichar and font.
393 // If not found, returns -1.
394 // If font_id < 0, the font_id is ignored and the first shape that matches
395 // the unichar_id is returned.
396 int ShapeTable::FindShape(int unichar_id, int font_id) const {
397  for (int s = 0; s < shape_table_.size(); ++s) {
398  const Shape& shape = GetShape(s);
399  for (int c = 0; c < shape.size(); ++c) {
400  if (shape[c].unichar_id == unichar_id) {
401  if (font_id < 0)
402  return s; // We don't care about the font.
403  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
404  if (shape[c].font_ids[f] == font_id)
405  return s;
406  }
407  }
408  }
409  }
410  return -1;
411 }
412 
413 // Returns the first unichar_id and font_id in the given shape.
415  int* unichar_id, int* font_id) const {
416  const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
417  *unichar_id = unichar_and_fonts.unichar_id;
418  *font_id = unichar_and_fonts.font_ids[0];
419 }
420 
421 // Expands all the classes/fonts in the shape individually to build
422 // a ShapeTable.
424  const ShapeTable& master_shapes) {
425  BitVector shape_map(master_shapes.NumShapes());
426  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
427  for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
428  int c = shape[u_ind].unichar_id;
429  int f = shape[u_ind].font_ids[f_ind];
430  int master_id = master_shapes.FindShape(c, f);
431  if (master_id >= 0) {
432  shape_map.SetBit(master_id);
433  } else if (FindShape(c, f) < 0) {
434  AddShape(c, f);
435  }
436  }
437  }
438  int num_masters = 0;
439  for (int s = 0; s < master_shapes.NumShapes(); ++s) {
440  if (shape_map[s]) {
441  AddShape(master_shapes.GetShape(s));
442  ++num_masters;
443  }
444  }
445  return num_masters;
446 }
447 
448 // Returns true if the shapes are already merged.
449 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
450  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
451 }
452 
453 // Returns true if any shape contains multiple unichars.
455  int num_shapes = NumShapes();
456  for (int s1 = 0; s1 < num_shapes; ++s1) {
457  if (MasterDestinationIndex(s1) != s1) continue;
458  if (GetShape(s1).size() > 1)
459  return true;
460  }
461  return false;
462 }
463 
464 // Returns the maximum number of unichars over all shapes.
466  int max_num_unichars = 0;
467  int num_shapes = NumShapes();
468  for (int s = 0; s < num_shapes; ++s) {
469  if (GetShape(s).size() > max_num_unichars)
470  max_num_unichars = GetShape(s).size();
471  }
472  return max_num_unichars;
473 }
474 
475 
476 // Merges shapes with a common unichar over the [start, end) interval.
477 // Assumes single unichar per shape.
478 void ShapeTable::ForceFontMerges(int start, int end) {
479  for (int s1 = start; s1 < end; ++s1) {
480  if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
481  int unichar_id = GetShape(s1)[0].unichar_id;
482  for (int s2 = s1 + 1; s2 < end; ++s2) {
483  if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
484  unichar_id == GetShape(s2)[0].unichar_id) {
485  MergeShapes(s1, s2);
486  }
487  }
488  }
489  }
490  ShapeTable compacted(*unicharset_);
491  compacted.AppendMasterShapes(*this, NULL);
492  *this = compacted;
493 }
494 
495 // Returns the number of unichars in the master shape.
496 int ShapeTable::MasterUnicharCount(int shape_id) const {
497  int master_id = MasterDestinationIndex(shape_id);
498  return GetShape(master_id).size();
499 }
500 
501 // Returns the sum of the font counts in the master shape.
502 int ShapeTable::MasterFontCount(int shape_id) const {
503  int master_id = MasterDestinationIndex(shape_id);
504  const Shape& shape = GetShape(master_id);
505  int font_count = 0;
506  for (int c = 0; c < shape.size(); ++c) {
507  font_count += shape[c].font_ids.size();
508  }
509  return font_count;
510 }
511 
512 // Returns the number of unichars that would result from merging the shapes.
513 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
514  // Do it the easy way for now.
515  int master_id1 = MasterDestinationIndex(shape_id1);
516  int master_id2 = MasterDestinationIndex(shape_id2);
517  Shape combined_shape(*shape_table_[master_id1]);
518  combined_shape.AddShape(*shape_table_[master_id2]);
519  return combined_shape.size();
520 }
521 
522 // Merges two shape_ids, leaving shape_id2 marked as merged.
523 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
524  int master_id1 = MasterDestinationIndex(shape_id1);
525  int master_id2 = MasterDestinationIndex(shape_id2);
526  // Point master_id2 (and all merged shapes) to master_id1.
527  shape_table_[master_id2]->set_destination_index(master_id1);
528  // Add all the shapes of master_id2 to master_id1.
529  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
530 }
531 
532 // Swaps two shape_ids.
533 void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
534  Shape* tmp = shape_table_[shape_id1];
535  shape_table_[shape_id1] = shape_table_[shape_id2];
536  shape_table_[shape_id2] = tmp;
537 }
538 
539 // Returns the destination of this shape, (if merged), taking into account
540 // the fact that the destination may itself have been merged.
541 int ShapeTable::MasterDestinationIndex(int shape_id) const {
542  int dest_id = shape_table_[shape_id]->destination_index();
543  if (dest_id == shape_id || dest_id < 0)
544  return shape_id; // Is master already.
545  int master_id = shape_table_[dest_id]->destination_index();
546  if (master_id == dest_id || master_id < 0)
547  return dest_id; // Dest is the master and shape_id points to it.
548  master_id = MasterDestinationIndex(master_id);
549  return master_id;
550 }
551 
552 // Returns false if the unichars in neither shape is a subset of the other.
553 bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
554  const Shape& shape1 = GetShape(shape_id1);
555  const Shape& shape2 = GetShape(shape_id2);
556  int c1, c2;
557  for (c1 = 0; c1 < shape1.size(); ++c1) {
558  int unichar_id1 = shape1[c1].unichar_id;
559  if (!shape2.ContainsUnichar(unichar_id1))
560  break;
561  }
562  for (c2 = 0; c2 < shape2.size(); ++c2) {
563  int unichar_id2 = shape2[c2].unichar_id;
564  if (!shape1.ContainsUnichar(unichar_id2))
565  break;
566  }
567  return c1 == shape1.size() || c2 == shape2.size();
568 }
569 
570 // Returns false if the unichars in neither shape is a subset of the other.
571 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
572  int shape_id) const {
573  const Shape& merge1 = GetShape(merge_id1);
574  const Shape& merge2 = GetShape(merge_id2);
575  const Shape& shape = GetShape(shape_id);
576  int cm1, cm2, cs;
577  for (cs = 0; cs < shape.size(); ++cs) {
578  int unichar_id = shape[cs].unichar_id;
579  if (!merge1.ContainsUnichar(unichar_id) &&
580  !merge2.ContainsUnichar(unichar_id))
581  break; // Shape is not a subset of the merge.
582  }
583  for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
584  int unichar_id1 = merge1[cm1].unichar_id;
585  if (!shape.ContainsUnichar(unichar_id1))
586  break; // Merge is not a subset of shape
587  }
588  for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
589  int unichar_id2 = merge2[cm2].unichar_id;
590  if (!shape.ContainsUnichar(unichar_id2))
591  break; // Merge is not a subset of shape
592  }
593  return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
594 }
595 
596 // Returns true if the unichar sets are equal between the shapes.
597 bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
598  const Shape& shape1 = GetShape(shape_id1);
599  const Shape& shape2 = GetShape(shape_id2);
600  for (int c1 = 0; c1 < shape1.size(); ++c1) {
601  int unichar_id1 = shape1[c1].unichar_id;
602  if (!shape2.ContainsUnichar(unichar_id1))
603  return false;
604  }
605  for (int c2 = 0; c2 < shape2.size(); ++c2) {
606  int unichar_id2 = shape2[c2].unichar_id;
607  if (!shape1.ContainsUnichar(unichar_id2))
608  return false;
609  }
610  return true;
611 }
612 
613 // Returns true if the unichar sets are equal between the shapes.
614 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
615  int shape_id) const {
616  const Shape& merge1 = GetShape(merge_id1);
617  const Shape& merge2 = GetShape(merge_id2);
618  const Shape& shape = GetShape(shape_id);
619  for (int cs = 0; cs < shape.size(); ++cs) {
620  int unichar_id = shape[cs].unichar_id;
621  if (!merge1.ContainsUnichar(unichar_id) &&
622  !merge2.ContainsUnichar(unichar_id))
623  return false; // Shape has a unichar that appears in neither merge.
624  }
625  for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
626  int unichar_id1 = merge1[cm1].unichar_id;
627  if (!shape.ContainsUnichar(unichar_id1))
628  return false; // Merge has a unichar that is not in shape.
629  }
630  for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
631  int unichar_id2 = merge2[cm2].unichar_id;
632  if (!shape.ContainsUnichar(unichar_id2))
633  return false; // Merge has a unichar that is not in shape.
634  }
635  return true;
636 }
637 
638 // Returns true if there is a common unichar between the shapes.
639 bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
640  const Shape& shape1 = GetShape(shape_id1);
641  const Shape& shape2 = GetShape(shape_id2);
642  for (int c1 = 0; c1 < shape1.size(); ++c1) {
643  int unichar_id1 = shape1[c1].unichar_id;
644  if (shape2.ContainsUnichar(unichar_id1))
645  return true;
646  }
647  return false;
648 }
649 
650 // Returns true if there is a common font id between the shapes.
651 bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
652  const Shape& shape1 = GetShape(shape_id1);
653  const Shape& shape2 = GetShape(shape_id2);
654  for (int c1 = 0; c1 < shape1.size(); ++c1) {
655  const GenericVector<int>& font_list1 = shape1[c1].font_ids;
656  for (int f = 0; f < font_list1.size(); ++f) {
657  if (shape2.ContainsFont(font_list1[f]))
658  return true;
659  }
660  }
661  return false;
662 }
663 
664 // Appends the master shapes from other to this.
665 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
667  GenericVector<int>* shape_map) {
668  if (shape_map != NULL)
669  shape_map->init_to_size(other.NumShapes(), -1);
670  for (int s = 0; s < other.shape_table_.size(); ++s) {
671  if (other.shape_table_[s]->destination_index() < 0) {
672  int index = AddShape(*other.shape_table_[s]);
673  if (shape_map != NULL)
674  (*shape_map)[s] = index;
675  }
676  }
677 }
678 
679 // Returns the number of master shapes remaining after merging.
681  int num_shapes = 0;
682  for (int s = 0; s < shape_table_.size(); ++s) {
683  if (shape_table_[s]->destination_index() < 0)
684  ++num_shapes;
685  }
686  return num_shapes;
687 }
688 
689 
690 // Adds the unichars of the given shape_id to the vector of results. Any
691 // unichar_id that is already present just has the fonts added to the
692 // font set for that result without adding a new entry in the vector.
693 // NOTE: it is assumed that the results are given to this function in order
694 // of decreasing rating.
695 // The unichar_map vector indicates the index of the results entry containing
696 // each unichar, or -1 if the unichar is not yet included in results.
698  GenericVector<int>* unichar_map,
699  GenericVector<UnicharRating>* results)const {
700  if (shape_rating.joined) {
701  AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map,
702  results);
703  }
704  if (shape_rating.broken) {
705  AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map,
706  results);
707  }
708  const Shape& shape = GetShape(shape_rating.shape_id);
709  for (int u = 0; u < shape.size(); ++u) {
710  int result_index = AddUnicharToResults(shape[u].unichar_id,
711  shape_rating.rating,
712  unichar_map, results);
713  for (int f = 0; f < shape[u].font_ids.size(); ++f) {
714  (*results)[result_index].fonts.push_back(
715  ScoredFont(shape[u].font_ids[f],
716  IntCastRounded(shape_rating.rating * MAX_INT16)));
717  }
718  }
719 }
720 
721 // Adds the given unichar_id to the results if needed, updating unichar_map
722 // and returning the index of unichar in results.
723 int ShapeTable::AddUnicharToResults(
724  int unichar_id, float rating, GenericVector<int>* unichar_map,
725  GenericVector<UnicharRating>* results) const {
726  int result_index = unichar_map->get(unichar_id);
727  if (result_index < 0) {
728  UnicharRating result(unichar_id, rating);
729  result_index = results->push_back(result);
730  (*unichar_map)[unichar_id] = result_index;
731  }
732  return result_index;
733 }
734 
735 
736 } // namespace tesseract
static int FirstResultWithUnichar(const GenericVector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
static int FirstResultWithUnichar(const GenericVector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:56
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
int NumShapes() const
Definition: shapetable.h:278
void AddShapeToResults(const ShapeRating &shape_rating, GenericVector< int > *unichar_map, GenericVector< UnicharRating > *results) const
Definition: shapetable.cpp:697
int IntCastRounded(double x)
Definition: helpers.h:172
bool operator==(const Shape &other) const
Definition: shapetable.cpp:206
T & get(int index) const
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:166
bool ContainsFontProperties(const FontInfoTable &font_table, uinT32 properties) const
Definition: shapetable.cpp:178
void AddShape(const Shape &other)
Definition: shapetable.cpp:129
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:74
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:396
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:541
GenericVector< inT32 > font_ids
Definition: shapetable.h:176
void ForceFontMerges(int start, int end)
Definition: shapetable.cpp:478
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:156
unsigned char uinT8
Definition: host.h:32
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:67
int NumMasterShapes() const
Definition: shapetable.cpp:680
void ReMapClassIds(const GenericVector< int > &unicharset_map)
Definition: shapetable.cpp:281
bool Serialize(FILE *fp) const
int push_back(T object)
void add_str_int(const char *str, int number)
Definition: strngs.cpp:384
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:666
STRING SummaryStr() const
Definition: shapetable.cpp:323
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:346
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:513
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:423
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:226
#define MAX_INT16
Definition: host.h:52
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:454
bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:614
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:523
void SwapShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:533
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:90
void DeleteShape(int shape_id)
Definition: shapetable.cpp:371
bool CommonUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:639
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:414
#define MAX(x, y)
Definition: ndminx.h:24
bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:571
bool AlreadyMerged(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:449
int MasterFontCount(int shape_id) const
Definition: shapetable.cpp:502
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:99
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:211
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:191
int MasterUnicharCount(int shape_id) const
Definition: shapetable.cpp:496
int size() const
Definition: shapetable.h:202
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:211
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:110
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:323
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:379
bool CommonFont(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:651
unsigned int uinT32
Definition: host.h:36
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:140
void AddShapeToShape(int shape_id, const Shape &other)
Definition: shapetable.cpp:386
bool SubsetUnichar(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:553
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:291
int MaxNumUnichars() const
Definition: shapetable.cpp:465
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
void init_to_size(int size, T t)
bool EqualUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:597
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:83
bool DeSerialize(bool swap, FILE *fp)
int UNICHAR_ID
Definition: unichar.h:33