tesseract  3.05.02
tesseract::TrainingSampleSet Class Reference

#include <trainingsampleset.h>

Public Member Functions

 TrainingSampleSet (const FontInfoTable &fontinfo_table)
 
 ~TrainingSampleSet ()
 
bool Serialize (FILE *fp) const
 
bool DeSerialize (bool swap, FILE *fp)
 
int num_samples () const
 
int num_raw_samples () const
 
int NumFonts () const
 
const UNICHARSET & unicharset () const
 
int charsetsize () const
 
const FontInfoTable & fontinfo_table () const
 
void LoadUnicharset (const char *filename)
 
int AddSample (const char *unichar, TrainingSample *sample)
 
void AddSample (int unichar_id, TrainingSample *sample)
 
int NumClassSamples (int font_id, int class_id, bool randomize) const
 
const TrainingSample * GetSample (int index) const
 
const TrainingSample * GetSample (int font_id, int class_id, int index) const
 
TrainingSample * MutableSample (int font_id, int class_id, int index)
 
STRING SampleToString (const TrainingSample &sample) const
 
const BitVector & GetCloudFeatures (int font_id, int class_id) const
 
const GenericVector< int > & GetCanonicalFeatures (int font_id, int class_id) const
 
float UnicharDistance (const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
 
float ClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
 
float ComputeClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
 
int ReliablySeparable (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
 
int GlobalSampleIndex (int font_id, int class_id, int index) const
 
const TrainingSample * GetCanonicalSample (int font_id, int class_id) const
 
float GetCanonicalDist (int font_id, int class_id) const
 
TrainingSample * mutable_sample (int index)
 
TrainingSample * extract_sample (int index)
 
void IndexFeatures (const IntFeatureSpace &feature_space)
 
void DeleteOutliers (const IntFeatureSpace &feature_space, bool debug)
 
void KillSample (TrainingSample *sample)
 
void DeleteDeadSamples ()
 
bool DeleteableSample (const TrainingSample *sample)
 
void OrganizeByFontAndClass ()
 
void SetupFontIdMap ()
 
void ComputeCanonicalSamples (const IntFeatureMap &map, bool debug)
 
void ReplicateAndRandomizeSamples ()
 
void ComputeCanonicalFeatures ()
 
void ComputeCloudFeatures (int feature_space_size)
 
void AddAllFontsForClass (int class_id, Shape *shape) const
 
void DisplaySamplesWithFeature (int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
 

Detailed Description

Definition at line 43 of file trainingsampleset.h.

Constructor & Destructor Documentation

◆ TrainingSampleSet()

tesseract::TrainingSampleSet::TrainingSampleSet ( const FontInfoTable & fontinfo_table)
explicit

Definition at line 70 of file trainingsampleset.cpp.

71  : num_raw_samples_(0), unicharset_size_(0),
72  font_class_array_(NULL), fontinfo_table_(font_table) {
73 }

◆ ~TrainingSampleSet()

tesseract::TrainingSampleSet::~TrainingSampleSet ( )

Definition at line 75 of file trainingsampleset.cpp.

75  {
76  delete font_class_array_;
77 }

Member Function Documentation

◆ AddAllFontsForClass()

void tesseract::TrainingSampleSet::AddAllFontsForClass ( int  class_id,
Shape * shape
) const

Definition at line 831 of file trainingsampleset.cpp.

831  {
832  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
833  int font_id = font_id_map_.CompactToSparse(f);
834  shape->AddToShape(class_id, font_id);
835  }
836 }
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
int CompactSize() const
Definition: indexmapbidi.h:61

◆ AddSample() [1/2]

int tesseract::TrainingSampleSet::AddSample ( const char *  unichar,
TrainingSample * sample
)

Definition at line 129 of file trainingsampleset.cpp.

129  {
130  if (!unicharset_.contains_unichar(unichar)) {
131  unicharset_.unichar_insert(unichar);
132  if (unicharset_.size() > MAX_NUM_CLASSES) {
133  tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
134  "greater than MAX_NUM_CLASSES\n");
135  return -1;
136  }
137  }
138  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
139  AddSample(char_id, sample);
140  return char_id;
141 }
bool TESS_API contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int size() const
Definition: unicharset.h:297
int AddSample(const char *unichar, TrainingSample *sample)
void TESS_API unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
Definition: cluster.h:32
int UNICHAR_ID
Definition: unichar.h:33

◆ AddSample() [2/2]

void tesseract::TrainingSampleSet::AddSample ( int  unichar_id,
TrainingSample * sample
)

Definition at line 145 of file trainingsampleset.cpp.

145  {
146  sample->set_class_id(unichar_id);
147  samples_.push_back(sample);
148  num_raw_samples_ = samples_.size();
149  unicharset_size_ = unicharset_.size();
150 }
int size() const
Definition: unicharset.h:297
Definition: cluster.h:32

◆ charsetsize()

int tesseract::TrainingSampleSet::charsetsize ( ) const
inline

Definition at line 67 of file trainingsampleset.h.

67  {
68  return unicharset_size_;
69  }

◆ ClusterDistance()

float tesseract::TrainingSampleSet::ClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap & feature_map
)

Definition at line 296 of file trainingsampleset.cpp.

298  {
299  ASSERT_HOST(font_class_array_ != NULL);
300  int font_index1 = font_id_map_.SparseToCompact(font_id1);
301  int font_index2 = font_id_map_.SparseToCompact(font_id2);
302  if (font_index1 < 0 || font_index2 < 0)
303  return 0.0f;
304  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
305  if (font_id1 == font_id2) {
306  // Special case cache for speed.
307  if (fc_info.unichar_distance_cache.size() == 0)
308  fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
309  if (fc_info.unichar_distance_cache[class_id2] < 0) {
310  // Distance has to be calculated.
311  float result = ComputeClusterDistance(font_id1, class_id1,
312  font_id2, class_id2,
313  feature_map);
314  fc_info.unichar_distance_cache[class_id2] = result;
315  // Copy to the symmetric cache entry.
316  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
317  if (fc_info2.unichar_distance_cache.size() == 0)
318  fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
319  fc_info2.unichar_distance_cache[class_id1] = result;
320  }
321  return fc_info.unichar_distance_cache[class_id2];
322  } else if (class_id1 == class_id2) {
323  // Another special-case cache for equal class-id.
324  if (fc_info.font_distance_cache.size() == 0)
325  fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
326  -1.0f);
327  if (fc_info.font_distance_cache[font_index2] < 0) {
328  // Distance has to be calculated.
329  float result = ComputeClusterDistance(font_id1, class_id1,
330  font_id2, class_id2,
331  feature_map);
332  fc_info.font_distance_cache[font_index2] = result;
333  // Copy to the symmetric cache entry.
334  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
335  if (fc_info2.font_distance_cache.size() == 0)
336  fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
337  -1.0f);
338  fc_info2.font_distance_cache[font_index1] = result;
339  }
340  return fc_info.font_distance_cache[font_index2];
341  }
342  // Both font and class are different. Linear search for class_id2/font_id2
343  // in what is a hopefully short list of distances.
344  int cache_index = 0;
345  while (cache_index < fc_info.distance_cache.size() &&
346  (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
347  fc_info.distance_cache[cache_index].font_id != font_id2))
348  ++cache_index;
349  if (cache_index == fc_info.distance_cache.size()) {
350  // Distance has to be calculated.
351  float result = ComputeClusterDistance(font_id1, class_id1,
352  font_id2, class_id2,
353  feature_map);
354  FontClassDistance fc_dist = { class_id2, font_id2, result };
355  fc_info.distance_cache.push_back(fc_dist);
356  // Copy to the symmetric cache entry. We know it isn't there already, as
357  // we always copy to the symmetric entry.
358  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
359  fc_dist.unichar_id = class_id1;
360  fc_dist.font_id = font_id1;
361  fc_info2.distance_cache.push_back(fc_dist);
362  }
363  return fc_info.distance_cache[cache_index].distance;
364 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ ComputeCanonicalFeatures()

void tesseract::TrainingSampleSet::ComputeCanonicalFeatures ( )

Definition at line 791 of file trainingsampleset.cpp.

791  {
792  ASSERT_HOST(font_class_array_ != NULL);
793  int font_size = font_id_map_.CompactSize();
794  for (int font_index = 0; font_index < font_size; ++font_index) {
795  int font_id = font_id_map_.CompactToSparse(font_index);
796  for (int c = 0; c < unicharset_size_; ++c) {
797  int num_samples = NumClassSamples(font_id, c, false);
798  if (num_samples == 0)
799  continue;
800  const TrainingSample* sample = GetCanonicalSample(font_id, c);
801  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
802  fcinfo.canonical_features = sample->indexed_features();
803  }
804  }
805 }
int NumClassSamples(int font_id, int class_id, bool randomize) const
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: cluster.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ ComputeCanonicalSamples()

void tesseract::TrainingSampleSet::ComputeCanonicalSamples ( const IntFeatureMap & map,
bool  debug 
)

Definition at line 660 of file trainingsampleset.cpp.

661  {
662  ASSERT_HOST(font_class_array_ != NULL);
663  IntFeatureDist f_table;
664  if (debug) tprintf("feature table size %d\n", map.sparse_size());
665  f_table.Init(&map);
666  int worst_s1 = 0;
667  int worst_s2 = 0;
668  double global_worst_dist = 0.0;
669  // Compute distances independently for each font and char index.
670  int font_size = font_id_map_.CompactSize();
671  for (int font_index = 0; font_index < font_size; ++font_index) {
672  int font_id = font_id_map_.CompactToSparse(font_index);
673  for (int c = 0; c < unicharset_size_; ++c) {
674  int samples_found = 0;
675  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
676  if (fcinfo.samples.size() == 0 ||
677  (kTestChar >= 0 && c != kTestChar)) {
678  fcinfo.canonical_sample = -1;
679  fcinfo.canonical_dist = 0.0f;
680  if (debug) tprintf("Skipping class %d\n", c);
681  continue;
682  }
683  // The canonical sample will be the one with the min_max_dist, which
684  // is the sample with the lowest maximum distance to all other samples.
685  double min_max_dist = 2.0;
686  // We keep track of the farthest apart pair (max_s1, max_s2) which
687  // are max_max_dist apart, so we can see how bad the variability is.
688  double max_max_dist = 0.0;
689  int max_s1 = 0;
690  int max_s2 = 0;
691  fcinfo.canonical_sample = fcinfo.samples[0];
692  fcinfo.canonical_dist = 0.0f;
693  for (int i = 0; i < fcinfo.samples.size(); ++i) {
694  int s1 = fcinfo.samples[i];
695  const GenericVector<int>& features1 = samples_[s1]->indexed_features();
696  f_table.Set(features1, features1.size(), true);
697  double max_dist = 0.0;
698  // Run the full squared-order search for similar samples. It is still
699  // reasonably fast because f_table.FeatureDistance is fast, but we
700  // may have to reconsider if we start playing with too many samples
701  // of a single char/font.
702  for (int j = 0; j < fcinfo.samples.size(); ++j) {
703  int s2 = fcinfo.samples[j];
704  if (samples_[s2]->class_id() != c ||
705  samples_[s2]->font_id() != font_id ||
706  s2 == s1)
707  continue;
708  GenericVector<int> features2 = samples_[s2]->indexed_features();
709  double dist = f_table.FeatureDistance(features2);
710  if (dist > max_dist) {
711  max_dist = dist;
712  if (dist > max_max_dist) {
713  max_s1 = s1;
714  max_s2 = s2;
715  }
716  }
717  }
718  // Using Set(..., false) is far faster than re initializing, due to
719  // the sparseness of the feature space.
720  f_table.Set(features1, features1.size(), false);
721  samples_[s1]->set_max_dist(max_dist);
722  ++samples_found;
723  if (max_dist < min_max_dist) {
724  fcinfo.canonical_sample = s1;
725  fcinfo.canonical_dist = max_dist;
726  }
727  UpdateRange(max_dist, &min_max_dist, &max_max_dist);
728  }
729  if (max_max_dist > global_worst_dist) {
730  // Keep a record of the worst pair over all characters/fonts too.
731  global_worst_dist = max_max_dist;
732  worst_s1 = max_s1;
733  worst_s2 = max_s2;
734  }
735  if (debug) {
736  tprintf("Found %d samples of class %d=%s, font %d, "
737  "dist range [%g, %g], worst pair= %s, %s\n",
738  samples_found, c, unicharset_.debug_str(c).string(),
739  font_index, min_max_dist, max_max_dist,
740  SampleToString(*samples_[max_s1]).string(),
741  SampleToString(*samples_[max_s2]).string());
742  }
743  }
744  }
745  if (debug) {
746  tprintf("Global worst dist = %g, between sample %d and %d\n",
747  global_worst_dist, worst_s1, worst_s2);
748  Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
749  Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
750  pixOr(pix1, pix1, pix2);
751  pixWrite("worstpair.png", pix1, IFF_PNG);
752  pixDestroy(&pix1);
753  pixDestroy(&pix2);
754  }
755 }
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
STRING SampleToString(const TrainingSample &sample) const
const char * string() const
Definition: strngs.cpp:201
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
const int kTestChar
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ ComputeCloudFeatures()

void tesseract::TrainingSampleSet::ComputeCloudFeatures ( int  feature_space_size)

Definition at line 809 of file trainingsampleset.cpp.

809  {
810  ASSERT_HOST(font_class_array_ != NULL);
811  int font_size = font_id_map_.CompactSize();
812  for (int font_index = 0; font_index < font_size; ++font_index) {
813  int font_id = font_id_map_.CompactToSparse(font_index);
814  for (int c = 0; c < unicharset_size_; ++c) {
815  int num_samples = NumClassSamples(font_id, c, false);
816  if (num_samples == 0)
817  continue;
818  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
819  fcinfo.cloud_features.Init(feature_space_size);
820  for (int s = 0; s < num_samples; ++s) {
821  const TrainingSample* sample = GetSample(font_id, c, s);
822  const GenericVector<int>& sample_features = sample->indexed_features();
823  for (int i = 0; i < sample_features.size(); ++i)
824  fcinfo.cloud_features.SetBit(sample_features[i]);
825  }
826  }
827  }
828 }
int NumClassSamples(int font_id, int class_id, bool randomize) const
const TrainingSample * GetSample(int index) const
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
int size() const
Definition: genericvector.h:72
Definition: cluster.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ ComputeClusterDistance()

float tesseract::TrainingSampleSet::ComputeClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap & feature_map
) const

Definition at line 367 of file trainingsampleset.cpp.

369  {
370  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
371  feature_map, false);
372  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
373  feature_map, false);
374  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
375  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
376  return static_cast<float>(dist) / denominator;
377 }
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
int size() const
Definition: genericvector.h:72

◆ DeleteableSample()

bool tesseract::TrainingSampleSet::DeleteableSample ( const TrainingSample * sample)

Definition at line 581 of file trainingsampleset.cpp.

581  {
582  return sample == NULL || sample->class_id() < 0;
583 }
Definition: cluster.h:32

◆ DeleteDeadSamples()

void tesseract::TrainingSampleSet::DeleteDeadSamples ( )

Definition at line 572 of file trainingsampleset.cpp.

572  {
573  samples_.compact(
574  NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
575  num_raw_samples_ = samples_.size();
576  // Samples must be re-organized now we have deleted a few.
577 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool DeleteableSample(const TrainingSample *sample)

◆ DeleteOutliers()

void tesseract::TrainingSampleSet::DeleteOutliers ( const IntFeatureSpace & feature_space,
bool  debug 
)

Definition at line 492 of file trainingsampleset.cpp.

493  {
494  if (font_class_array_ == NULL)
495  OrganizeByFontAndClass();
496  Pixa* pixa = NULL;
497  if (debug)
498  pixa = pixaCreate(0);
499  GenericVector<int> feature_counts;
500  int fs_size = feature_space.Size();
501  int font_size = font_id_map_.CompactSize();
502  for (int font_index = 0; font_index < font_size; ++font_index) {
503  for (int c = 0; c < unicharset_size_; ++c) {
504  // Create a histogram of the features used by all samples of this
505  // font/class combination.
506  feature_counts.init_to_size(fs_size, 0);
507  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
508  int sample_count = fcinfo.samples.size();
509  if (sample_count < kMinOutlierSamples)
510  continue;
511  for (int i = 0; i < sample_count; ++i) {
512  int s = fcinfo.samples[i];
513  const GenericVector<int>& features = samples_[s]->indexed_features();
514  for (int f = 0; f < features.size(); ++f) {
515  ++feature_counts[features[f]];
516  }
517  }
518  for (int i = 0; i < sample_count; ++i) {
519  int s = fcinfo.samples[i];
520  const TrainingSample& sample = *samples_[s];
521  const GenericVector<int>& features = sample.indexed_features();
522  // A feature that has a histogram count of 1 is only used by this
523  // sample, making it 'bad'. All others are 'good'.
524  int good_features = 0;
525  int bad_features = 0;
526  for (int f = 0; f < features.size(); ++f) {
527  if (feature_counts[features[f]] > 1)
528  ++good_features;
529  else
530  ++bad_features;
531  }
532  // If more than 1/3 features are bad, then this is an outlier.
533  if (bad_features * 2 > good_features) {
534  tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
535  SampleToString(sample).string(),
536  good_features, bad_features);
537  if (debug) {
538  pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
539  // Add the previous sample as well, so it is easier to see in
540  // the output what is wrong with this sample.
541  int t;
542  if (i == 0)
543  t = fcinfo.samples[1];
544  else
545  t = fcinfo.samples[i - 1];
546  const TrainingSample &csample = *samples_[t];
547  pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
548  }
549  // Mark the sample for deletion.
550  KillSample(samples_[s]);
551  }
552  }
553  }
554  }
555  // Truly delete all bad samples and renumber everything.
556  DeleteDeadSamples();
557  if (pixa != NULL) {
558  Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
559  pixaDestroy(&pixa);
560  pixWrite("outliers.png", pix, IFF_PNG);
561  pixDestroy(&pix);
562  }
563 }
STRING SampleToString(const TrainingSample &sample) const
#define tprintf(...)
Definition: tprintf.h:31
void KillSample(TrainingSample *sample)
int size() const
Definition: genericvector.h:72
const int kMinOutlierSamples
Definition: cluster.h:32
void init_to_size(int size, T t)
int CompactSize() const
Definition: indexmapbidi.h:61

◆ DeSerialize()

bool tesseract::TrainingSampleSet::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 94 of file trainingsampleset.cpp.

94  {
95  if (!samples_.DeSerialize(swap, fp)) return false;
96  num_raw_samples_ = samples_.size();
97  if (!unicharset_.load_from_file(fp)) return false;
98  if (!font_id_map_.DeSerialize(swap, fp)) return false;
99  delete font_class_array_;
100  font_class_array_ = NULL;
101  inT8 not_null;
102  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
103  if (not_null) {
104  FontClassInfo empty;
105  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
106  if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
107  }
108  unicharset_size_ = unicharset_.size();
109  return true;
110 }
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: matrix.h:195
int size() const
Definition: unicharset.h:297
SIGNED char inT8
Definition: host.h:31
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
bool DeSerialize(bool swap, FILE *fp)

◆ DisplaySamplesWithFeature()

void tesseract::TrainingSampleSet::DisplaySamplesWithFeature ( int  f_index,
const Shape & shape,
const IntFeatureSpace & feature_space,
ScrollView::Color  color,
ScrollView * window
) const

Definition at line 840 of file trainingsampleset.cpp.

844  {
845  for (int s = 0; s < num_raw_samples(); ++s) {
846  const TrainingSample* sample = GetSample(s);
847  if (shape.ContainsUnichar(sample->class_id())) {
848  GenericVector<int> indexed_features;
849  space.IndexAndSortFeatures(sample->features(), sample->num_features(),
850  &indexed_features);
851  for (int f = 0; f < indexed_features.size(); ++f) {
852  if (indexed_features[f] == f_index) {
853  sample->DisplayFeatures(color, window);
854  }
855  }
856  }
857  }
858 }
const TrainingSample * GetSample(int index) const
int size() const
Definition: genericvector.h:72
Definition: cluster.h:32

◆ extract_sample()

TrainingSample* tesseract::TrainingSampleSet::extract_sample ( int  index)
inline

Definition at line 165 of file trainingsampleset.h.

165  {
166  TrainingSample* sample = samples_[index];
167  samples_[index] = NULL;
168  return sample;
169  }
Definition: cluster.h:32

◆ fontinfo_table()

const FontInfoTable& tesseract::TrainingSampleSet::fontinfo_table ( ) const
inline

Definition at line 70 of file trainingsampleset.h.

70  {
71  return fontinfo_table_;
72  }

◆ GetCanonicalDist()

float tesseract::TrainingSampleSet::GetCanonicalDist ( int  font_id,
int  class_id 
) const

Definition at line 474 of file trainingsampleset.cpp.

474  {
475  ASSERT_HOST(font_class_array_ != NULL);
476  int font_index = font_id_map_.SparseToCompact(font_id);
477  if (font_index < 0) return 0.0f;
478  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
479  return (*font_class_array_)(font_index, class_id).canonical_dist;
480  else
481  return 0.0f;
482 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetCanonicalFeatures()

const GenericVector< int > & tesseract::TrainingSampleSet::GetCanonicalFeatures ( int  font_id,
int  class_id 
) const

Definition at line 219 of file trainingsampleset.cpp.

220  {
221  int font_index = font_id_map_.SparseToCompact(font_id);
222  ASSERT_HOST(font_index >= 0);
223  return (*font_class_array_)(font_index, class_id).canonical_features;
224 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetCanonicalSample()

const TrainingSample * tesseract::TrainingSampleSet::GetCanonicalSample ( int  font_id,
int  class_id 
) const

Definition at line 462 of file trainingsampleset.cpp.

463  {
464  ASSERT_HOST(font_class_array_ != NULL);
465  int font_index = font_id_map_.SparseToCompact(font_id);
466  if (font_index < 0) return NULL;
467  int sample_index = (*font_class_array_)(font_index,
468  class_id).canonical_sample;
469  return sample_index >= 0 ? samples_[sample_index] : NULL;
470 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetCloudFeatures()

const BitVector & tesseract::TrainingSampleSet::GetCloudFeatures ( int  font_id,
int  class_id 
) const

Definition at line 211 of file trainingsampleset.cpp.

212  {
213  int font_index = font_id_map_.SparseToCompact(font_id);
214  ASSERT_HOST(font_index >= 0);
215  return (*font_class_array_)(font_index, class_id).cloud_features;
216 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetSample() [1/2]

const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  index) const

Definition at line 174 of file trainingsampleset.cpp.

174  {
175  return samples_[index];
176 }

◆ GetSample() [2/2]

const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 180 of file trainingsampleset.cpp.

181  {
182  ASSERT_HOST(font_class_array_ != NULL);
183  int font_index = font_id_map_.SparseToCompact(font_id);
184  if (font_index < 0) return NULL;
185  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
186  return samples_[sample_index];
187 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GlobalSampleIndex()

int tesseract::TrainingSampleSet::GlobalSampleIndex ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 452 of file trainingsampleset.cpp.

453  {
454  ASSERT_HOST(font_class_array_ != NULL);
455  int font_index = font_id_map_.SparseToCompact(font_id);
456  if (font_index < 0) return -1;
457  return (*font_class_array_)(font_index, class_id).samples[index];
458 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ IndexFeatures()

void tesseract::TrainingSampleSet::IndexFeatures ( const IntFeatureSpace & feature_space)

Definition at line 485 of file trainingsampleset.cpp.

485  {
486  for (int s = 0; s < samples_.size(); ++s)
487  samples_[s]->IndexFeatures(feature_space);
488 }
void IndexFeatures(const IntFeatureSpace &feature_space)

◆ KillSample()

void tesseract::TrainingSampleSet::KillSample ( TrainingSample * sample)

Definition at line 567 of file trainingsampleset.cpp.

567  {
568  sample->set_sample_index(-1);
569 }
Definition: cluster.h:32

◆ LoadUnicharset()

void tesseract::TrainingSampleSet::LoadUnicharset ( const char *  filename)

Definition at line 113 of file trainingsampleset.cpp.

113  {
114  if (!unicharset_.load_from_file(filename)) {
115  tprintf("Failed to load unicharset from file %s\n"
116  "Building unicharset from scratch...\n",
117  filename);
118  unicharset_.clear();
119  // Add special characters as they were removed by the clear.
120  UNICHARSET empty;
121  unicharset_.AppendOtherUnicharset(empty);
122  }
123  unicharset_size_ = unicharset_.size();
124 }
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
int size() const
Definition: unicharset.h:297
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void clear()
Definition: unicharset.h:265
#define tprintf(...)
Definition: tprintf.h:31

◆ mutable_sample()

TrainingSample* tesseract::TrainingSampleSet::mutable_sample ( int  index)
inline

Definition at line 161 of file trainingsampleset.h.

161  {
162  return samples_[index];
163  }

◆ MutableSample()

TrainingSample * tesseract::TrainingSampleSet::MutableSample ( int  font_id,
int  class_id,
int  index 
)

Definition at line 191 of file trainingsampleset.cpp.

192  {
193  ASSERT_HOST(font_class_array_ != NULL);
194  int font_index = font_id_map_.SparseToCompact(font_id);
195  if (font_index < 0) return NULL;
196  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
197  return samples_[sample_index];
198 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ num_raw_samples()

int tesseract::TrainingSampleSet::num_raw_samples ( ) const
inline

Definition at line 58 of file trainingsampleset.h.

58  {
59  return num_raw_samples_;
60  }

◆ num_samples()

int tesseract::TrainingSampleSet::num_samples ( ) const
inline

Definition at line 55 of file trainingsampleset.h.

55  {
56  return samples_.size();
57  }

◆ NumClassSamples()

int tesseract::TrainingSampleSet::NumClassSamples ( int  font_id,
int  class_id,
bool  randomize 
) const

Definition at line 156 of file trainingsampleset.cpp.

157  {
158  ASSERT_HOST(font_class_array_ != NULL);
159  if (font_id < 0 || class_id < 0 ||
160  font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
161  // There are no samples because the font or class doesn't exist.
162  return 0;
163  }
164  int font_index = font_id_map_.SparseToCompact(font_id);
165  if (font_index < 0)
166  return 0; // The font has no samples.
167  if (randomize)
168  return (*font_class_array_)(font_index, class_id).samples.size();
169  else
170  return (*font_class_array_)(font_index, class_id).num_raw_samples;
171 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
virtual int SparseSize() const
Definition: indexmapbidi.h:142
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ NumFonts()

int tesseract::TrainingSampleSet::NumFonts ( ) const
inline

Definition at line 61 of file trainingsampleset.h.

61  {
62  return font_id_map_.SparseSize();
63  }
virtual int SparseSize() const
Definition: indexmapbidi.h:142

◆ OrganizeByFontAndClass()

void tesseract::TrainingSampleSet::OrganizeByFontAndClass ( )

Definition at line 602 of file trainingsampleset.cpp.

602  {
603  // Font indexes are sparse, so we used a map to compact them, so we can
604  // have an efficient 2-d array of fonts and character classes.
605  SetupFontIdMap();
606  int compact_font_size = font_id_map_.CompactSize();
607  // Get a 2-d array of generic vectors.
608  if (font_class_array_ != NULL)
609  delete font_class_array_;
610  FontClassInfo empty;
611  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
612  compact_font_size, unicharset_size_, empty);
613  for (int s = 0; s < samples_.size(); ++s) {
614  int font_id = samples_[s]->font_id();
615  int class_id = samples_[s]->class_id();
616  if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
617  tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
618  font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
619  s);
620  }
621  ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
622  ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
623  int font_index = font_id_map_.SparseToCompact(font_id);
624  (*font_class_array_)(font_index, class_id).samples.push_back(s);
625  }
626  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
627  // between the raw samples and the replicated ones.
628  for (int f = 0; f < compact_font_size; ++f) {
629  for (int c = 0; c < unicharset_size_; ++c)
630  (*font_class_array_)(f, c).num_raw_samples =
631  (*font_class_array_)(f, c).samples.size();
632  }
633  // This is the global number of samples and also marks the boundary between
634  // real and replicated samples.
635  num_raw_samples_ = samples_.size();
636 }
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
virtual int SparseSize() const
Definition: indexmapbidi.h:142
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ ReliablySeparable()

int tesseract::TrainingSampleSet::ReliablySeparable ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap & feature_map,
bool  thorough 
) const

Definition at line 413 of file trainingsampleset.cpp.

416  {
417  int result = 0;
418  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
419  if (sample2 == NULL)
420  return 0; // There are no canonical features.
421  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
422  class_id2);
423  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
424  if (cloud1.size() == 0)
425  return canonical2.size(); // There are no cloud features.
426 
427  // Find a canonical2 feature that is not in cloud1.
428  for (int f = 0; f < canonical2.size(); ++f) {
429  int feature = canonical2[f];
430  if (cloud1[feature])
431  continue;
432  // Gather the near neighbours of f.
433  GenericVector<int> good_features;
434  AddNearFeatures(feature_map, feature, 1, &good_features);
435  // Check that none of the good_features are in the cloud.
436  int i;
437  for (i = 0; i < good_features.size(); ++i) {
438  int good_f = good_features[i];
439  if (cloud1[good_f]) {
440  break;
441  }
442  }
443  if (i < good_features.size())
444  continue; // Found one in the cloud.
445  ++result;
446  }
447  return result;
448 }
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
int size() const
Definition: genericvector.h:72

◆ ReplicateAndRandomizeSamples()

void tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples ( )

Definition at line 762 of file trainingsampleset.cpp.

762  {
763  ASSERT_HOST(font_class_array_ != NULL);
764  int font_size = font_id_map_.CompactSize();
765  for (int font_index = 0; font_index < font_size; ++font_index) {
766  for (int c = 0; c < unicharset_size_; ++c) {
767  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
768  int sample_count = fcinfo.samples.size();
769  int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
770  if (sample_count > 0 && sample_count < min_samples) {
771  int base_count = sample_count;
772  for (int base_index = 0; sample_count < min_samples; ++sample_count) {
773  int src_index = fcinfo.samples[base_index++];
774  if (base_index >= base_count) base_index = 0;
775  TrainingSample* sample = samples_[src_index]->RandomizedCopy(
776  sample_count % kSampleRandomSize);
777  int sample_index = samples_.size();
778  sample->set_sample_index(sample_index);
779  samples_.push_back(sample);
780  fcinfo.samples.push_back(sample_index);
781  }
782  }
783  }
784  }
785 }
#define MAX(x, y)
Definition: ndminx.h:24
Definition: cluster.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:84
int CompactSize() const
Definition: indexmapbidi.h:61

◆ SampleToString()

STRING tesseract::TrainingSampleSet::SampleToString ( const TrainingSample & sample) const

Definition at line 202 of file trainingsampleset.cpp.

202  {
203  STRING boxfile_str;
204  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
205  sample.bounding_box(), sample.page_num(), &boxfile_str);
206  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
207 }
T & get(int index) const
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
Definition: boxread.cpp:226
Definition: strngs.h:44
Definition: cluster.h:32
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

◆ Serialize()

bool tesseract::TrainingSampleSet::Serialize ( FILE *  fp) const

Definition at line 80 of file trainingsampleset.cpp.

80  {
81  if (!samples_.Serialize(fp)) return false;
82  if (!unicharset_.save_to_file(fp)) return false;
83  if (!font_id_map_.Serialize(fp)) return false;
84  inT8 not_null = font_class_array_ != NULL;
85  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
86  if (not_null) {
87  if (!font_class_array_->SerializeClasses(fp)) return false;
88  }
89  return true;
90 }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
bool SerializeClasses(FILE *fp) const
Definition: matrix.h:182
SIGNED char inT8
Definition: host.h:31
bool Serialize(FILE *fp) const

◆ SetupFontIdMap()

void tesseract::TrainingSampleSet::SetupFontIdMap ( )

Definition at line 640 of file trainingsampleset.cpp.

640  {
641  // Number of samples for each font_id.
642  GenericVector<int> font_counts;
643  for (int s = 0; s < samples_.size(); ++s) {
644  int font_id = samples_[s]->font_id();
645  while (font_id >= font_counts.size())
646  font_counts.push_back(0);
647  ++font_counts[font_id];
648  }
649  font_id_map_.Init(font_counts.size(), false);
650  for (int f = 0; f < font_counts.size(); ++f) {
651  font_id_map_.SetMap(f, font_counts[f] > 0);
652  }
653  font_id_map_.Setup();
654 }
int push_back(T object)
int size() const
Definition: genericvector.h:72
void Init(int size, bool all_mapped)
void SetMap(int sparse_index, bool mapped)

◆ UnicharDistance()

float tesseract::TrainingSampleSet::UnicharDistance ( const UnicharAndFonts & uf1,
const UnicharAndFonts & uf2,
bool  matched_fonts,
const IntFeatureMap & feature_map 
)

Definition at line 230 of file trainingsampleset.cpp.

233  {
234  int num_fonts1 = uf1.font_ids.size();
235  int c1 = uf1.unichar_id;
236  int num_fonts2 = uf2.font_ids.size();
237  int c2 = uf2.unichar_id;
238  double dist_sum = 0.0;
239  int dist_count = 0;
240  bool debug = false;
241  if (matched_fonts) {
242  // Compute distances only where fonts match.
243  for (int i = 0; i < num_fonts1; ++i) {
244  int f1 = uf1.font_ids[i];
245  for (int j = 0; j < num_fonts2; ++j) {
246  int f2 = uf2.font_ids[j];
247  if (f1 == f2) {
248  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
249  ++dist_count;
250  }
251  }
252  }
253  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
254  // Small enough sets to compute all the distances.
255  for (int i = 0; i < num_fonts1; ++i) {
256  int f1 = uf1.font_ids[i];
257  for (int j = 0; j < num_fonts2; ++j) {
258  int f2 = uf2.font_ids[j];
259  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
260  if (debug) {
261  tprintf("Cluster dist %d %d %d %d = %g\n",
262  f1, c1, f2, c2,
263  ClusterDistance(f1, c1, f2, c2, feature_map));
264  }
265  ++dist_count;
266  }
267  }
268  } else {
269  // Subsample distances, using the largest set once, and stepping through
270  // the smaller set so as to ensure that all the pairs are different.
271  int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
272  int index = 0;
273  int num_samples = MAX(num_fonts1, num_fonts2);
274  for (int i = 0; i < num_samples; ++i, index += increment) {
275  int f1 = uf1.font_ids[i % num_fonts1];
276  int f2 = uf2.font_ids[index % num_fonts2];
277  if (debug) {
278  tprintf("Cluster dist %d %d %d %d = %g\n",
279  f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
280  }
281  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
282  ++dist_count;
283  }
284  }
285  if (dist_count == 0) {
286  if (matched_fonts)
287  return UnicharDistance(uf1, uf2, false, feature_map);
288  return 0.0f;
289  }
290  return dist_sum / dist_count;
291 }
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
const int kPrime1
const int kSquareLimit
#define MAX(x, y)
Definition: ndminx.h:24
#define tprintf(...)
Definition: tprintf.h:31
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
const int kPrime2

◆ unicharset()

const UNICHARSET& tesseract::TrainingSampleSet::unicharset ( ) const
inline

Definition at line 64 of file trainingsampleset.h.

64  {
65  return unicharset_;
66  }

The documentation for this class was generated from the following files: