tesseract  3.05.02
sampleiterator.h
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 
17 #ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
18 #define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
19 
20 namespace tesseract {
21 
22 class IndexMapBiDi;
23 class IntFeatureMap;
24 class ShapeTable;
25 class TrainingSample;
26 class TrainingSampleSet;
27 struct UnicharAndFonts;
28 
29 // Iterator class to encapsulate the complex iteration involved in getting
30 // all samples of all shapes needed for a classification problem.
31 //
32 // =====INPUTS TO Init FUNCTION=====
33 // The charset_map defines a subset of the sample_set classes (if the
34 // shape_table is NULL), or of the shape_table classes (if it is not NULL).
35 //
36 // The shape_table (if not NULL) defines the mapping from shapes to
37 // font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
38 //
39 // The sample_set holds the samples and provides indexed access to samples
40 // of font_id/class_id pairs.
41 //
42 // If randomize is true, the samples are perturbed slightly, but the
43 // perturbation is guaranteed to be the same for multiple identical
44 // iterations.
45 //
46 // =====DIFFERENT COMBINATIONS OF INPUTS=====
47 // NULL shape_table:
48 // Without a shape_table, everything works in UNICHAR_IDs.
49 //
50 // NULL shape_table, NULL charset_map:
51 // Iterations simply run over the samples in the order the samples occur in the
52 // input files.
53 // GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
54 //
55 // NULL shape_table, non-NULL charset_map:
56 // When shape_table is NULL, the charset_map indexes unichar_ids directly,
57 // and an iteration returns all samples of all chars in the charset_map, which
58 // is a subset of the full unicharset.
59 // The iteration will be in groups of the same unichar_id, in the order
60 // defined by the charset_map.
61 // GetCompactClassID returns the charset_map index of a sample, and
62 // GetSparseClassID returns the sample UNICHAR_ID.
63 //
64 // Non-NULL shape_table:
65 // With a shape_table, samples are grouped according to the shape_table, so
66 // multiple UNICHAR_IDs and fonts may be grouped together, and everything
67 // works in shape_ids.
68 //
69 // Non-NULL shape_table, NULL charset_map:
70 // Iterations simply run over the samples in the order of shape_id.
71 // GetCompactClassID and GetSparseClassID both return the shape_id.
72 // (If you want the unichar_id or font_id, the sample still has them.)
73 //
74 // Non-NULL shape_table, non-NULL charset_map:
75 // When shape_table is not NULL, the charset_map indexes and subsets shapes in
76 // the shape_table, and iterations will be in shape_table order, not
77 // charset_map order.
78 // GetCompactClassID returns the charset_map index of a shape, and
79 // GetSparseClassID returns the shape_id.
80 //
81 // =====What is SampleIterator good for?=====
82 // Inside a classifier training module, the SampleIterator has abstracted away
83 // all the different modes above.
84 // Use the following iteration to train your classifier:
85 // for (it.Begin(); !it.AtEnd(); it.Next()) {
86 // const TrainingSample& sample = it.GetSample();
87 // int class_id = it.GetCompactClassID();
88 // Your classifier may or may not be dealing with a shape_table, and may be
89 // dealing with some subset of the character/shape set. It doesn't need to
90 // know and shouldn't care. It is just learning shapes with compact class ids
91 // in the range [0, it.CompactCharsetSize()).
93  public:
96 
97  void Clear();
98 
99  // Supplies the inputs described in the class comment (see "INPUTS TO Init").
100  void Init(const IndexMapBiDi* charset_map,  // Subsets the classes; may be NULL.
101  const ShapeTable* shape_table,  // Maps shapes to font/class pairs; may be NULL.
102  bool randomize,  // If true, the samples are perturbed slightly.
103  TrainingSampleSet* sample_set);  // Holds the samples to iterate over.
104 
105  // Iterator functions designed for use with a simple for loop:
106  // for (it.Begin(); !it.AtEnd(); it.Next()) {
107  // const TrainingSample& sample = it.GetSample();
108  // int class_id = it.GetCompactClassID();
109  // ...
110  // }
111  void Begin();  // Loop initializer in the pattern above.
112  bool AtEnd() const;  // Loop termination test in the pattern above.
113  const TrainingSample& GetSample() const;  // Read-only access to the current sample.
114  TrainingSample* MutableSample() const;  // NOTE(review): presumably the same sample as GetSample(), writable - confirm.
115  // Returns the total index (from the original set of samples) of the current
116  // sample.
117  int GlobalSampleIndex() const;
118  // Returns the index of the current sample in compact charset space, so
119  // in a 2-class problem between x and y, the returned indices will all be
120  // 0 or 1, and have nothing to do with the unichar_ids.
121  // If the charset_map_ is NULL, then this is equal to GetSparseClassID().
122  int GetCompactClassID() const;
123  // Returns the index of the current sample in sparse charset space, so
124  // in a 2-class problem between x and y, the returned indices will all be
125  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
126  // with a shape_table_.
127  int GetSparseClassID() const;
128  // Moves on to the next indexable sample. If the end is reached, leaves
129  // the state such that AtEnd() is true.
130  void Next();
131 
132  // Returns the size of the compact charset space.
133  int CompactCharsetSize() const;
134  // Returns the size of the sparse charset space.
135  int SparseCharsetSize() const;
136 
137  const IndexMapBiDi& charset_map() const {  // Precondition: charset_map_ != NULL.
138  return *charset_map_;  // Unconditional dereference; the class comment allows a
139  }  // NULL charset_map, in which case this accessor must not be called.
140  const ShapeTable* shape_table() const {  // Returns shape_table_ as stored;
141  return shape_table_;  // no NULL check is made here, so callers should check.
142  }
143  // Read-only access to the sample set being iterated over (sample_set_).
144  const TrainingSampleSet* sample_set() const {
145  return sample_set_;
146  }
147 
148  // A set of functions that do something to all the samples accessed by the
149  // iterator, as it is currently set up.
150 
151  // Apply the supplied feature_space/feature_map transform to all samples
152  // accessed by this iterator.
153  void MapSampleFeatures(const IntFeatureMap& feature_map);
154 
155  // Adjust the weights of all the samples to be uniform in the given charset.
156  // Returns the number of samples in the iterator.
157  int UniformSamples();
158 
159  // Normalize the weights of all the samples defined by the iterator so they
160  // sum to 1. Returns the minimum assigned sample weight.
161  double NormalizeSamples();
162 
163  private:
164  // Helper returns the current UnicharAndFonts shape_entry.
165  const UnicharAndFonts* GetShapeEntry() const;
166 
167  // Map to subset the actual charset space.
168  const IndexMapBiDi* charset_map_;
169  // Shape table to recombine character classes into shapes
170  const ShapeTable* shape_table_;
171  // The samples to iterate over.
172  TrainingSampleSet* sample_set_;
173  // Flag to control randomizing the sample features.
174  bool randomize_;
175  // Shape table owned by this iterator, used to iterate character classes.
176  ShapeTable* owned_shape_table_;
177 
178  // Top-level iteration. Shape index in sparse charset_map space.
179  int shape_index_;
180  int num_shapes_;
181  // Index to the character class within a shape.
182  int shape_char_index_;
183  int num_shape_chars_;
184  // Index to the font within a shape/class pair.
185  int shape_font_index_;
186  int num_shape_fonts_;
187  // The lowest level iteration. sample_index_/num_samples_ counts samples
188  // in the current shape/class/font combination.
189  int sample_index_;
190  int num_samples_;
191 };
192 
193 } // namespace tesseract.
194 
195 #endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
const ShapeTable * shape_table() const
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
void MapSampleFeatures(const IntFeatureMap &feature_map)
const TrainingSample & GetSample() const
TrainingSample * MutableSample() const
const TrainingSampleSet * sample_set() const
const IndexMapBiDi & charset_map() const