tesseract  3.05.02
tesseract::MasterTrainer Class Reference

#include <mastertrainer.h>

Public Member Functions

 MasterTrainer (NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
 
 ~MasterTrainer ()
 
bool Serialize (FILE *fp) const
 
bool DeSerialize (bool swap, FILE *fp)
 
void LoadUnicharset (const char *filename)
 
void SetFeatureSpace (const IntFeatureSpace &fs)
 
void ReadTrainingSamples (const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
 
void AddSample (bool verification, const char *unichar_str, TrainingSample *sample)
 
void LoadPageImages (const char *filename)
 
void PostLoadCleanup ()
 
void PreTrainingSetup ()
 
void SetupMasterShapes ()
 
void IncludeJunk ()
 
void ReplicateAndRandomizeSamplesIfRequired ()
 
bool LoadFontInfo (const char *filename)
 
bool LoadXHeights (const char *filename)
 
bool AddSpacingInfo (const char *filename)
 
int GetFontInfoId (const char *font_name)
 
int GetBestMatchingFontInfoId (const char *filename)
 
const STRING & GetTRFileName (int index) const
 
void SetupFlatShapeTable (ShapeTable *shape_table)
 
CLUSTERER * SetupForClustering (const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
 
void WriteInttempAndPFFMTable (const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
 
const UNICHARSET & unicharset () const
 
TrainingSampleSet * GetSamples ()
 
const ShapeTable & master_shapes () const
 
void DebugCanonical (const char *unichar_str1, const char *unichar_str2)
 
void DisplaySamples (const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
 
void TestClassifierVOld (bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
 
void TestClassifierOnSamples (CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
 
double TestClassifier (CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
 
float ShapeDistance (const ShapeTable &shapes, int s1, int s2)
 

Detailed Description

Definition at line 69 of file mastertrainer.h.

Constructor & Destructor Documentation

◆ MasterTrainer()

tesseract::MasterTrainer::MasterTrainer ( NormalizationMode  norm_mode,
bool  shape_analysis,
bool  replicate_samples,
int  debug_level 
)

Definition at line 54 of file mastertrainer.cpp.

58  : norm_mode_(norm_mode), samples_(fontinfo_table_),
59  junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
60  charsetsize_(0),
61  enable_shape_anaylsis_(shape_analysis),
62  enable_replication_(replicate_samples),
63  fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
64 }

◆ ~MasterTrainer()

tesseract::MasterTrainer::~MasterTrainer ( )

Definition at line 66 of file mastertrainer.cpp.

66  {
67  delete [] fragments_;
68  for (int p = 0; p < page_images_.size(); ++p)
69  pixDestroy(&page_images_[p]);
70 }
int size() const
Definition: genericvector.h:72

Member Function Documentation

◆ AddSample()

void tesseract::MasterTrainer::AddSample ( bool  verification,
const char *  unichar_str,
TrainingSample *  sample 
)

Definition at line 186 of file mastertrainer.cpp.

187  {
188  if (verification) {
189  verify_samples_.AddSample(unichar, sample);
190  prev_unichar_id_ = -1;
191  } else if (unicharset_.contains_unichar(unichar)) {
192  if (prev_unichar_id_ >= 0)
193  fragments_[prev_unichar_id_] = -1;
194  prev_unichar_id_ = samples_.AddSample(unichar, sample);
195  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
196  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
197  } else {
198  int junk_id = junk_samples_.AddSample(unichar, sample);
199  if (prev_unichar_id_ >= 0) {
200  CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar);
201  if (frag != NULL && frag->is_natural()) {
202  if (fragments_[prev_unichar_id_] == 0)
203  fragments_[prev_unichar_id_] = junk_id;
204  else if (fragments_[prev_unichar_id_] != junk_id)
205  fragments_[prev_unichar_id_] = -1;
206  }
207  delete frag;
208  }
209  prev_unichar_id_ = -1;
210  }
211 }
bool TESS_API contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:396
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:346
int AddSample(const char *unichar, TrainingSample *sample)
bool is_natural() const
Definition: unicharset.h:107
Definition: cluster.h:32
static CHAR_FRAGMENT * parse_from_string(const char *str)

◆ AddSpacingInfo()

bool tesseract::MasterTrainer::AddSpacingInfo ( const char *  filename)

Definition at line 435 of file mastertrainer.cpp.

435  {
436  FILE* fontinfo_file = fopen(filename, "rb");
437  if (fontinfo_file == NULL)
438  return true; // We silently ignore missing files!
439  // Find the fontinfo_id.
440  int fontinfo_id = GetBestMatchingFontInfoId(filename);
441  if (fontinfo_id < 0) {
442  tprintf("No font found matching fontinfo filename %s\n", filename);
443  fclose(fontinfo_file);
444  return false;
445  }
446  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
447  // TODO(rays) scale should probably be a double, but keep as an int for now
448  // to duplicate current behavior.
449  int scale = kBlnXHeight / xheights_[fontinfo_id];
450  int num_unichars;
451  char uch[UNICHAR_LEN];
452  char kerned_uch[UNICHAR_LEN];
453  int x_gap, x_gap_before, x_gap_after, num_kerned;
454  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
455  FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
456  fi->init_spacing(unicharset_.size());
457  FontSpacingInfo *spacing = NULL;
458  for (int l = 0; l < num_unichars; ++l) {
459  if (tfscanf(fontinfo_file, "%s %d %d %d",
460  uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
461  tprintf("Bad format of font spacing file %s\n", filename);
462  fclose(fontinfo_file);
463  return false;
464  }
465  bool valid = unicharset_.contains_unichar(uch);
466  if (valid) {
467  spacing = new FontSpacingInfo();
468  spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
469  spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
470  }
471  for (int k = 0; k < num_kerned; ++k) {
472  if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
473  tprintf("Bad format of font spacing file %s\n", filename);
474  fclose(fontinfo_file);
475  delete spacing;
476  return false;
477  }
478  if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
479  spacing->kerned_unichar_ids.push_back(
480  unicharset_.unichar_to_id(kerned_uch));
481  spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
482  }
483  if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
484  }
485  fclose(fontinfo_file);
486  return true;
487 }
short inT16
Definition: host.h:33
T & get(int index) const
bool TESS_API contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int size() const
Definition: unicharset.h:297
int GetBestMatchingFontInfoId(const char *filename)
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
#define UNICHAR_LEN
Definition: unichar.h:30
const int kBlnXHeight
Definition: normalis.h:28
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ DebugCanonical()

void tesseract::MasterTrainer::DebugCanonical ( const char *  unichar_str1,
const char *  unichar_str2 
)

Definition at line 651 of file mastertrainer.cpp.

652  {
653  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
654  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
655  if (class_id2 == INVALID_UNICHAR_ID)
656  class_id2 = class_id1;
657  if (class_id1 == INVALID_UNICHAR_ID) {
658  tprintf("No unicharset entry found for %s\n", unichar_str1);
659  return;
660  } else {
661  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
662  class_id1, unichar_str1, class_id2, unichar_str2);
663  }
664  int num_fonts = samples_.NumFonts();
665  const IntFeatureMap& feature_map = feature_map_;
666  // Iterate the fonts to get the similarity with other fonts of the same
667  // class.
668  tprintf(" ");
669  for (int f = 0; f < num_fonts; ++f) {
670  if (samples_.NumClassSamples(f, class_id2, false) == 0)
671  continue;
672  tprintf("%6d", f);
673  }
674  tprintf("\n");
675  for (int f1 = 0; f1 < num_fonts; ++f1) {
676  // Map the features of the canonical_sample.
677  if (samples_.NumClassSamples(f1, class_id1, false) == 0)
678  continue;
679  tprintf("%4d ", f1);
680  for (int f2 = 0; f2 < num_fonts; ++f2) {
681  if (samples_.NumClassSamples(f2, class_id2, false) == 0)
682  continue;
683  float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
684  feature_map);
685  tprintf(" %5.3f", dist);
686  }
687  tprintf("\n");
688  }
689  // Build a fake ShapeTable containing all the sample types.
690  ShapeTable shapes(unicharset_);
691  for (int f = 0; f < num_fonts; ++f) {
692  if (samples_.NumClassSamples(f, class_id1, true) > 0)
693  shapes.AddShape(class_id1, f);
694  if (class_id1 != class_id2 &&
695  samples_.NumClassSamples(f, class_id2, true) > 0)
696  shapes.AddShape(class_id2, f);
697  }
698 }
int NumClassSamples(int font_id, int class_id, bool randomize) const
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)

◆ DeSerialize()

bool tesseract::MasterTrainer::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 91 of file mastertrainer.cpp.

91  {
92  if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
93  if (swap) {
94  ReverseN(&norm_mode_, sizeof(norm_mode_));
95  }
96  if (!unicharset_.load_from_file(fp)) return false;
97  charsetsize_ = unicharset_.size();
98  if (!feature_space_.DeSerialize(swap, fp)) return false;
99  feature_map_.Init(feature_space_);
100  if (!samples_.DeSerialize(swap, fp)) return false;
101  if (!junk_samples_.DeSerialize(swap, fp)) return false;
102  if (!verify_samples_.DeSerialize(swap, fp)) return false;
103  if (!master_shapes_.DeSerialize(swap, fp)) return false;
104  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
105  if (!fontinfo_table_.DeSerialize(swap, fp)) return false;
106  if (!xheights_.DeSerialize(swap, fp)) return false;
107  return true;
108 }
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
bool DeSerialize(bool swap, FILE *fp)
int size() const
Definition: unicharset.h:297
bool DeSerialize(bool swap, FILE *fp)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
bool DeSerialize(bool swap, FILE *fp)
Definition: fontinfo.cpp:54
void TESS_API Init(const IntFeatureSpace &feature_space)
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
bool DeSerialize(bool swap, FILE *fp)

◆ DisplaySamples()

void tesseract::MasterTrainer::DisplaySamples ( const char *  unichar_str1,
int  cloud_font,
const char *  unichar_str2,
int  canonical_font 
)

Definition at line 711 of file mastertrainer.cpp.

713  {
714  const IntFeatureMap& feature_map = feature_map_;
715  const IntFeatureSpace& feature_space = feature_map.feature_space();
716  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
717  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
718  f_window);
719  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
720  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
721  const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
722  class_id2);
723  for (int f = 0; f < sample->num_features(); ++f) {
724  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
725  }
726  }
727  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
728  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
729  const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
730  for (int f = 0; f < cloud.size(); ++f) {
731  if (cloud[f]) {
732  INT_FEATURE_STRUCT feature =
733  feature_map.InverseIndexFeature(f);
734  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
735  }
736  }
737  }
738  f_window->Update();
739  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
740  SVEventType ev_type;
741  do {
742  SVEvent* ev;
743  // Wait until a click or popup event.
744  ev = f_window->AwaitEvent(SVET_ANY);
745  ev_type = ev->type;
746  if (ev_type == SVET_CLICK) {
747  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
748  if (feature_index >= 0) {
749  // Iterate samples and display those with the feature.
750  Shape shape;
751  shape.AddToShape(class_id1, cloud_font);
752  s_window->Clear();
753  samples_.DisplaySamplesWithFeature(feature_index, shape,
754  feature_space, ScrollView::GREEN,
755  s_window);
756  s_window->Update();
757  }
758  }
759  delete ev;
760  } while (ev_type != SVET_DESTROY);
761 }
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1920
SVEventType
Definition: scrollview.h:45
SVEventType type
Definition: scrollview.h:64
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
static void Update()
Definition: scrollview.cpp:715
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1755
const UNICHARSET & unicharset() const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:1095
void Clear()
Definition: scrollview.cpp:595
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
Definition: cluster.h:32
int y
Definition: scrollview.h:67
int x
Definition: scrollview.h:66

◆ GetBestMatchingFontInfoId()

int tesseract::MasterTrainer::GetBestMatchingFontInfoId ( const char *  filename)

Definition at line 502 of file mastertrainer.cpp.

502  {
503  int fontinfo_id = -1;
504  int best_len = 0;
505  for (int f = 0; f < fontinfo_table_.size(); ++f) {
506  if (strstr(filename, fontinfo_table_.get(f).name) != NULL) {
507  int len = strlen(fontinfo_table_.get(f).name);
508  // Use the longest matching length in case a substring of a font matched.
509  if (len > best_len) {
510  best_len = len;
511  fontinfo_id = f;
512  }
513  }
514  }
515  return fontinfo_id;
516 }
T & get(int index) const
int size() const
Definition: genericvector.h:72

◆ GetFontInfoId()

int tesseract::MasterTrainer::GetFontInfoId ( const char *  font_name)

Definition at line 491 of file mastertrainer.cpp.

491  {
492  FontInfo fontinfo;
493  // We are only borrowing the string, so it is OK to const cast it.
494  fontinfo.name = const_cast<char*>(font_name);
495  fontinfo.properties = 0; // Not used to lookup in the table
496  fontinfo.universal_id = 0;
497  return fontinfo_table_.get_index(fontinfo);
498 }
int get_index(T object) const

◆ GetSamples()

TrainingSampleSet* tesseract::MasterTrainer::GetSamples ( )
inline

Definition at line 192 of file mastertrainer.h.

192  {
193  return &samples_;
194  }

◆ GetTRFileName()

const STRING& tesseract::MasterTrainer::GetTRFileName ( int  index) const
inline

Definition at line 165 of file mastertrainer.h.

165  {
166  return tr_filenames_[index];
167  }

◆ IncludeJunk()

void tesseract::MasterTrainer::IncludeJunk ( )

Definition at line 318 of file mastertrainer.cpp.

318  {
319  // Get ids of fragments in junk_samples_ that replace the dead chars.
320  const UNICHARSET& junk_set = junk_samples_.unicharset();
321  const UNICHARSET& sample_set = samples_.unicharset();
322  int num_junks = junk_samples_.num_samples();
323  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
324  for (int s = 0; s < num_junks; ++s) {
325  TrainingSample* sample = junk_samples_.mutable_sample(s);
326  int junk_id = sample->class_id();
327  const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
328  int sample_id = sample_set.unichar_to_id(junk_utf8);
329  if (sample_id == INVALID_UNICHAR_ID)
330  sample_id = 0;
331  sample->set_class_id(sample_id);
332  junk_samples_.extract_sample(s);
333  samples_.AddSample(sample_id, sample);
334  }
335  junk_samples_.DeleteDeadSamples();
336  samples_.OrganizeByFontAndClass();
337 }
const UNICHARSET & unicharset() const
int AddSample(const char *unichar, TrainingSample *sample)
TrainingSample * mutable_sample(int index)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
TrainingSample * extract_sample(int index)
Definition: cluster.h:32
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

◆ LoadFontInfo()

bool tesseract::MasterTrainer::LoadFontInfo ( const char *  filename)

Definition at line 356 of file mastertrainer.cpp.

356  {
357  FILE* fp = fopen(filename, "rb");
358  if (fp == NULL) {
359  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
360  return false;
361  }
362  int italic, bold, fixed, serif, fraktur;
363  while (!feof(fp)) {
364  FontInfo fontinfo;
365  char* font_name = new char[1024];
366  fontinfo.name = font_name;
367  fontinfo.properties = 0;
368  fontinfo.universal_id = 0;
369  if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
370  &fixed, &serif, &fraktur) != 6) {
371  delete[] font_name;
372  continue;
373  }
374  fontinfo.properties =
375  (italic << 0) +
376  (bold << 1) +
377  (fixed << 2) +
378  (serif << 3) +
379  (fraktur << 4);
380  if (!fontinfo_table_.contains(fontinfo)) {
381  fontinfo_table_.push_back(fontinfo);
382  } else {
383  delete[] font_name;
384  }
385  }
386  fclose(fp);
387  return true;
388 }
bool contains(T object) const
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
int push_back(T object)

◆ LoadPageImages()

void tesseract::MasterTrainer::LoadPageImages ( const char *  filename)

Definition at line 216 of file mastertrainer.cpp.

216  {
217  size_t offset = 0;
218  int page;
219  Pix* pix;
220  for (page = 0; ; page++) {
221  pix = pixReadFromMultipageTiff(filename, &offset);
222  if (!pix) break;
223  page_images_.push_back(pix);
224  if (!offset) break;
225  }
226  tprintf("Loaded %d page images from %s\n", page, filename);
227 }
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31

◆ LoadUnicharset()

void tesseract::MasterTrainer::LoadUnicharset ( const char *  filename)

Definition at line 111 of file mastertrainer.cpp.

111  {
112  if (!unicharset_.load_from_file(filename)) {
113  tprintf("Failed to load unicharset from file %s\n"
114  "Building unicharset for training from scratch...\n",
115  filename);
116  unicharset_.clear();
117  UNICHARSET initialized;
118  // Add special characters, as they were removed by the clear, but the
119  // default constructor puts them in.
120  unicharset_.AppendOtherUnicharset(initialized);
121  }
122  charsetsize_ = unicharset_.size();
123  delete [] fragments_;
124  fragments_ = new int[charsetsize_];
125  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
126  samples_.LoadUnicharset(filename);
127  junk_samples_.LoadUnicharset(filename);
128  verify_samples_.LoadUnicharset(filename);
129 }
void LoadUnicharset(const char *filename)
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
int size() const
Definition: unicharset.h:297
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void clear()
Definition: unicharset.h:265
#define tprintf(...)
Definition: tprintf.h:31

◆ LoadXHeights()

bool tesseract::MasterTrainer::LoadXHeights ( const char *  filename)

Definition at line 392 of file mastertrainer.cpp.

392  {
393  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
394  xheights_.init_to_size(fontinfo_table_.size(), -1);
395  if (filename == NULL) return true;
396  FILE *f = fopen(filename, "rb");
397  if (f == NULL) {
398  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
399  return false;
400  }
401  tprintf("Reading x-heights from %s ...\n", filename);
402  FontInfo fontinfo;
403  fontinfo.properties = 0; // Not used to lookup in the table.
404  fontinfo.universal_id = 0;
405  char buffer[1024];
406  int xht;
407  int total_xheight = 0;
408  int xheight_count = 0;
409  while (!feof(f)) {
410  if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2)
411  continue;
412  buffer[1023] = '\0';
413  fontinfo.name = buffer;
414  if (!fontinfo_table_.contains(fontinfo)) continue;
415  int fontinfo_id = fontinfo_table_.get_index(fontinfo);
416  xheights_[fontinfo_id] = xht;
417  total_xheight += xht;
418  ++xheight_count;
419  }
420  if (xheight_count == 0) {
421  fprintf(stderr, "No valid xheights in %s!\n", filename);
422  fclose(f);
423  return false;
424  }
425  int mean_xheight = DivRounded(total_xheight, xheight_count);
426  for (int i = 0; i < fontinfo_table_.size(); ++i) {
427  if (xheights_[i] < 0)
428  xheights_[i] = mean_xheight;
429  }
430  fclose(f);
431  return true;
432 } // LoadXHeights
bool contains(T object) const
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
int get_index(T object) const
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
int DivRounded(int a, int b)
Definition: helpers.h:166
void init_to_size(int size, T t)

◆ master_shapes()

const ShapeTable& tesseract::MasterTrainer::master_shapes ( ) const
inline

Definition at line 195 of file mastertrainer.h.

195  {
196  return master_shapes_;
197  }

◆ PostLoadCleanup()

void tesseract::MasterTrainer::PostLoadCleanup ( )

Definition at line 234 of file mastertrainer.cpp.

234  {
235  if (debug_level_ > 0)
236  tprintf("PostLoadCleanup...\n");
237  if (enable_shape_anaylsis_)
238  ReplaceFragmentedSamples();
239  SampleIterator sample_it;
240  sample_it.Init(NULL, NULL, true, &verify_samples_);
241  sample_it.NormalizeSamples();
242  verify_samples_.OrganizeByFontAndClass();
243 
244  samples_.IndexFeatures(feature_space_);
245  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
246  // against current training.
247  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
248  samples_.OrganizeByFontAndClass();
249  if (debug_level_ > 0)
250  tprintf("ComputeCanonicalSamples...\n");
251  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
252 }
void IndexFeatures(const IntFeatureSpace &feature_space)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
#define tprintf(...)
Definition: tprintf.h:31

◆ PreTrainingSetup()

void tesseract::MasterTrainer::PreTrainingSetup ( )

Definition at line 257 of file mastertrainer.cpp.

257  {
258  if (debug_level_ > 0)
259  tprintf("PreTrainingSetup...\n");
260  samples_.IndexFeatures(feature_space_);
261  samples_.ComputeCanonicalFeatures();
262  if (debug_level_ > 0)
263  tprintf("ComputeCloudFeatures...\n");
264  samples_.ComputeCloudFeatures(feature_space_.Size());
265 }
void IndexFeatures(const IntFeatureSpace &feature_space)
#define tprintf(...)
Definition: tprintf.h:31
void ComputeCloudFeatures(int feature_space_size)

◆ ReadTrainingSamples()

void tesseract::MasterTrainer::ReadTrainingSamples ( const char *  page_name,
const FEATURE_DEFS_STRUCT &  feature_defs,
bool  verification 
)

Definition at line 135 of file mastertrainer.cpp.

137  {
138  char buffer[2048];
139  int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
140  int micro_feature_type = ShortNameToFeatureType(feature_defs,
141  kMicroFeatureType);
142  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
143  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
144 
145  FILE* fp = Efopen(page_name, "rb");
146  if (fp == NULL) {
147  tprintf("Failed to open tr file: %s\n", page_name);
148  return;
149  }
150  tr_filenames_.push_back(STRING(page_name));
151  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
152  if (buffer[0] == '\n')
153  continue;
154 
155  char* space = strchr(buffer, ' ');
156  if (space == NULL) {
157  tprintf("Bad format in tr file, reading fontname, unichar\n");
158  continue;
159  }
160  *space++ = '\0';
161  int font_id = GetFontInfoId(buffer);
162  if (font_id < 0) font_id = 0;
163  int page_number;
164  STRING unichar;
165  TBOX bounding_box;
166  if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
167  tprintf("Bad format in tr file, reading box coords\n");
168  continue;
169  }
170  CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
171  TrainingSample* sample = new TrainingSample;
172  sample->set_font_id(font_id);
173  sample->set_page_num(page_number + page_images_.size());
174  sample->set_bounding_box(bounding_box);
175  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
176  cn_feature_type, geo_feature_type, char_desc);
177  AddSample(verification, unichar.string(), sample);
178  FreeCharDescription(char_desc);
179  }
180  charsetsize_ = unicharset_.size();
181  fclose(fp);
182 }
FEATURE_DEFS_STRUCT feature_defs
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
int size() const
Definition: unicharset.h:297
int push_back(T object)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
const char * string() const
Definition: strngs.cpp:201
const char * kMicroFeatureType
Definition: featdefs.cpp:41
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
const char * kGeoFeatureType
Definition: featdefs.cpp:44
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:166
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
int GetFontInfoId(const char *font_name)
Definition: cluster.h:32
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:141
Definition: rect.h:30
const char * kIntFeatureType
Definition: featdefs.cpp:43
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
const char * kCNFeatureType
Definition: featdefs.cpp:42

◆ ReplicateAndRandomizeSamplesIfRequired()

void tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired ( )

Definition at line 344 of file mastertrainer.cpp.

344  {
345  if (enable_replication_) {
346  if (debug_level_ > 0)
347  tprintf("ReplicateAndRandomize...\n");
348  verify_samples_.ReplicateAndRandomizeSamples();
349  samples_.ReplicateAndRandomizeSamples();
350  samples_.IndexFeatures(feature_space_);
351  }
352 }
void IndexFeatures(const IntFeatureSpace &feature_space)
#define tprintf(...)
Definition: tprintf.h:31

◆ Serialize()

bool tesseract::MasterTrainer::Serialize ( FILE *  fp) const

Definition at line 75 of file mastertrainer.cpp.

75  {
76  if (fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
77  if (!unicharset_.save_to_file(fp)) return false;
78  if (!feature_space_.Serialize(fp)) return false;
79  if (!samples_.Serialize(fp)) return false;
80  if (!junk_samples_.Serialize(fp)) return false;
81  if (!verify_samples_.Serialize(fp)) return false;
82  if (!master_shapes_.Serialize(fp)) return false;
83  if (!flat_shapes_.Serialize(fp)) return false;
84  if (!fontinfo_table_.Serialize(fp)) return false;
85  if (!xheights_.Serialize(fp)) return false;
86  return true;
87 }
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:49
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
bool Serialize(FILE *fp) const
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
bool Serialize(FILE *fp) const
bool Serialize(FILE *fp) const

◆ SetFeatureSpace()

void tesseract::MasterTrainer::SetFeatureSpace ( const IntFeatureSpace &  fs)
inline

Definition at line 85 of file mastertrainer.h.

85  {
86  feature_space_ = fs;
87  feature_map_.Init(fs);
88  }
void TESS_API Init(const IntFeatureSpace &feature_space)

◆ SetupFlatShapeTable()

void tesseract::MasterTrainer::SetupFlatShapeTable ( ShapeTable *  shape_table)

Definition at line 519 of file mastertrainer.cpp.

519  {
520  // To exactly mimic the results of the previous implementation, the shapes
521  // must be clustered in order the fonts arrived, and reverse order of the
522  // characters within each font.
523  // Get a list of the fonts in the order they appeared.
524  GenericVector<int> active_fonts;
525  int num_shapes = flat_shapes_.NumShapes();
526  for (int s = 0; s < num_shapes; ++s) {
527  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
528  int f = 0;
529  for (f = 0; f < active_fonts.size(); ++f) {
530  if (active_fonts[f] == font)
531  break;
532  }
533  if (f == active_fonts.size())
534  active_fonts.push_back(font);
535  }
536  // For each font in order, add all the shapes with that font in reverse order.
537  int num_fonts = active_fonts.size();
538  for (int f = 0; f < num_fonts; ++f) {
539  for (int s = num_shapes - 1; s >= 0; --s) {
540  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
541  if (font == active_fonts[f]) {
542  shape_table->AddShape(flat_shapes_.GetShape(s));
543  }
544  }
545  }
546 }
int NumShapes() const
Definition: shapetable.h:278
void AddShape(const Shape &other)
Definition: shapetable.cpp:129
int push_back(T object)
int size() const
Definition: genericvector.h:72
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:323

◆ SetupForClustering()

CLUSTERER * tesseract::MasterTrainer::SetupForClustering ( const ShapeTable &  shape_table,
const FEATURE_DEFS_STRUCT &  feature_defs,
int  shape_id,
int *  num_samples 
)

Definition at line 550 of file mastertrainer.cpp.

554  {
555 
556  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
557  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
558  ASSERT_HOST(num_params == MFCount);
559  CLUSTERER* clusterer = MakeClusterer(
560  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
561 
562  // We want to iterate over the samples of just the one shape.
563  IndexMapBiDi shape_map;
564  shape_map.Init(shape_table.NumShapes(), false);
565  shape_map.SetMap(shape_id, true);
566  shape_map.Setup();
567  // Reverse the order of the samples to match the previous behavior.
568  GenericVector<const TrainingSample*> sample_ptrs;
569  SampleIterator it;
570  it.Init(&shape_map, &shape_table, false, &samples_);
571  for (it.Begin(); !it.AtEnd(); it.Next()) {
572  sample_ptrs.push_back(&it.GetSample());
573  }
574  int sample_id = 0;
575  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
576  const TrainingSample* sample = sample_ptrs[i];
577  int num_features = sample->num_micro_features();
578  for (int f = 0; f < num_features; ++f)
579  MakeSample(clusterer, sample->micro_features()[f], sample_id);
580  ++sample_id;
581  }
582  *num_samples = sample_id;
583  return clusterer;
584 }
FEATURE_DEFS_STRUCT feature_defs
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:456
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:400
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
Definition: mf.h:30
int push_back(T object)
const char * kMicroFeatureType
Definition: featdefs.cpp:41
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
int size() const
Definition: genericvector.h:72
Definition: cluster.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SetupMasterShapes()

void tesseract::MasterTrainer::SetupMasterShapes ( )

Definition at line 269 of file mastertrainer.cpp.

269  {
270  tprintf("Building master shape table\n");
271  int num_fonts = samples_.NumFonts();
272 
273  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
274  ShapeTable char_shapes_end_fragment(samples_.unicharset());
275  ShapeTable char_shapes(samples_.unicharset());
276  for (int c = 0; c < samples_.charsetsize(); ++c) {
277  ShapeTable shapes(samples_.unicharset());
278  for (int f = 0; f < num_fonts; ++f) {
279  if (samples_.NumClassSamples(f, c, true) > 0)
280  shapes.AddShape(c, f);
281  }
282  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
283 
284  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
285 
286  if (fragment == NULL)
287  char_shapes.AppendMasterShapes(shapes, NULL);
288  else if (fragment->is_beginning())
289  char_shapes_begin_fragment.AppendMasterShapes(shapes, NULL);
290  else if (fragment->is_ending())
291  char_shapes_end_fragment.AppendMasterShapes(shapes, NULL);
292  else
293  char_shapes.AppendMasterShapes(shapes, NULL);
294  }
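295  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
296  kFontMergeDistance, &char_shapes_begin_fragment);
297  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, NULL);
298  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
299  kFontMergeDistance, &char_shapes_end_fragment);
300  char_shapes.AppendMasterShapes(char_shapes_end_fragment, NULL);
301  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
302  kFontMergeDistance, &char_shapes);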
303  master_shapes_.AppendMasterShapes(char_shapes, NULL);
304  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
305 }
int NumClassSamples(int font_id, int class_id, bool randomize) const
const float kFontMergeDistance
const UNICHARSET & unicharset() const
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:666
STRING SummaryStr() const
Definition: shapetable.cpp:323
bool is_ending() const
Definition: unicharset.h:102
const char * string() const
Definition: strngs.cpp:201
const int kMinClusteredShapes
#define tprintf(...)
Definition: tprintf.h:31
const int kMaxUnicharsPerCluster
bool is_beginning() const
Definition: unicharset.h:99
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682

◆ ShapeDistance()

float tesseract::MasterTrainer::ShapeDistance ( const ShapeTable & shapes,
int  s1,
int  s2 
)

Definition at line 825 of file mastertrainer.cpp.

825  {
826  const IntFeatureMap& feature_map = feature_map_;
827  const Shape& shape1 = shapes.GetShape(s1);
828  const Shape& shape2 = shapes.GetShape(s2);
829  int num_chars1 = shape1.size();
830  int num_chars2 = shape2.size();
831  float dist_sum = 0.0f;
832  int dist_count = 0;
833  if (num_chars1 > 1 || num_chars2 > 1) {
834  // In the multi-char case try to optimize the calculation by computing
835  // distances between characters of matching font where possible.
836  for (int c1 = 0; c1 < num_chars1; ++c1) {
837  for (int c2 = 0; c2 < num_chars2; ++c2) {
838  dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
839  true, feature_map);
840  ++dist_count;
841  }
842  }
843  } else {
844  // In the single unichar case, there is little alternative, but to compute
845  // the squared-order distance between pairs of fonts.
846  dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
847  false, feature_map);
848  ++dist_count;
849  }
850  return dist_sum / dist_count;
851 }
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)

◆ TestClassifier()

double tesseract::MasterTrainer::TestClassifier ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
TrainingSampleSet * samples,
ShapeClassifier * test_classifier,
STRING * report_string 
)

Definition at line 798 of file mastertrainer.cpp.

803  {
804  SampleIterator sample_it;
805  sample_it.Init(NULL, NULL, replicate_samples, samples);
806  if (report_level > 0) {
807  int num_samples = 0;
808  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
809  ++num_samples;
810  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
811  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
812  test_classifier->GetShapeTable()->NumShapes(), num_samples);
813  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
814  }
815  double unichar_error = 0.0;
816  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
817  error_mode, fontinfo_table_,
818  page_images_, &sample_it, &unichar_error,
819  NULL, report_string);
820  return unichar_error;
821 }
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
#define tprintf(...)
Definition: tprintf.h:31

◆ TestClassifierOnSamples()

void tesseract::MasterTrainer::TestClassifierOnSamples ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
ShapeClassifier * test_classifier,
STRING * report_string 
)

Definition at line 776 of file mastertrainer.cpp.

780  {
781  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
782  test_classifier, report_string);
783 }
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)

◆ TestClassifierVOld()

void tesseract::MasterTrainer::TestClassifierVOld ( bool  replicate_samples,
ShapeClassifier * test_classifier,
ShapeClassifier * old_classifier 
)

Definition at line 764 of file mastertrainer.cpp.

766  {
767  SampleIterator sample_it;
768  sample_it.Init(NULL, NULL, replicate_samples, &samples_);
769  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
770  CT_UNICHAR_TOPN_ERR, fontinfo_table_,
771  page_images_, &sample_it);
772 }
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it)

◆ unicharset()

const UNICHARSET& tesseract::MasterTrainer::unicharset ( ) const
inline

Definition at line 189 of file mastertrainer.h.

189  {
190  return samples_.unicharset();
191  }
const UNICHARSET & unicharset() const

◆ WriteInttempAndPFFMTable()

void tesseract::MasterTrainer::WriteInttempAndPFFMTable ( const UNICHARSET & unicharset,
const UNICHARSET & shape_set,
const ShapeTable & shape_table,
CLASS_STRUCT * float_classes,
const char *  inttemp_file,
const char *  pffmtable_file 
)

Definition at line 590 of file mastertrainer.cpp.

595  {
596  tesseract::Classify *classify = new tesseract::Classify();
597  // Move the fontinfo table to classify.
598  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
599  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
600  shape_set);
601  FILE* fp = fopen(inttemp_file, "wb");
602  classify->WriteIntTemplates(fp, int_templates, shape_set);
603  fclose(fp);
604  // Now write pffmtable. This is complicated by the fact that the adaptive
605  // classifier still wants one indexed by unichar-id, but the static
606  // classifier needs one indexed by its shape class id.
607  // We put the shapetable_cutoffs in a GenericVector, and compute the
608  // unicharset cutoffs along the way.
609  GenericVector<uinT16> shapetable_cutoffs;
610  GenericVector<uinT16> unichar_cutoffs;
611  for (int c = 0; c < unicharset.size(); ++c)
612  unichar_cutoffs.push_back(0);
613  /* then write out each class */
614  for (int i = 0; i < int_templates->NumClasses; ++i) {
615  INT_CLASS Class = ClassForClassId(int_templates, i);
616  // Todo: Test with min instead of max
617  // int MaxLength = LengthForConfigId(Class, 0);
618  uinT16 max_length = 0;
619  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
620  // Todo: Test with min instead of max
621  // if (LengthForConfigId (Class, config_id) < MaxLength)
622  uinT16 length = Class->ConfigLengths[config_id];
623  if (length > max_length)
624  max_length = Class->ConfigLengths[config_id];
625  int shape_id = float_classes[i].font_set.get(config_id);
626  const Shape& shape = shape_table.GetShape(shape_id);
627  for (int c = 0; c < shape.size(); ++c) {
628  int unichar_id = shape[c].unichar_id;
629  if (length > unichar_cutoffs[unichar_id])
630  unichar_cutoffs[unichar_id] = length;
631  }
632  }
633  shapetable_cutoffs.push_back(max_length);
634  }
635  fp = fopen(pffmtable_file, "wb");
636  shapetable_cutoffs.Serialize(fp);
637  for (int c = 0; c < unicharset.size(); ++c) {
638  const char *unichar = unicharset.id_to_unichar(c);
639  if (strcmp(unichar, " ") == 0) {
640  unichar = "NULL";
641  }
642  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
643  }
644  fclose(fp);
645  free_int_templates(int_templates);
646  delete classify;
647 }
int size() const
Definition: unicharset.h:297
UnicityTableEqEq< int > font_set
Definition: protos.h:65
bool Serialize(FILE *fp) const
int push_back(T object)
#define ClassForClassId(T, c)
Definition: intproto.h:181
void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:106
unsigned short uinT16
Definition: host.h:34
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:739
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1129
const T & get(int id) const
Return the object from an id.
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
uinT16 ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:113
uinT8 NumConfigs
Definition: intproto.h:110
const UNICHARSET & unicharset() const
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:557

The documentation for this class was generated from the following files: