tesseract  3.05.02
classify.cpp
Go to the documentation of this file.
1 // File: classify.cpp
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "classify.h"
25 #include "fontinfo.h"
26 #include "intproto.h"
27 #include "mfoutline.h"
28 #include "scrollview.h"
29 #include "shapeclassifier.h"
30 #include "shapetable.h"
31 #include "unicity_table.h"
32 #include <string.h>
33 
34 namespace tesseract {
36  : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
37  this->params()),
38  BOOL_MEMBER(prioritize_division, FALSE,
39  "Prioritize blob division over chopping", this->params()),
40  INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP",
41  this->params()),
42  BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
43  this->params()),
44  INT_MEMBER(classify_debug_level, 0, "Classify debug level",
45  this->params()),
46  INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
47  this->params()),
48  double_MEMBER(classify_char_norm_range, 0.2,
49  "Character Normalization Range ...", this->params()),
50  double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
51  this->params()), /* PREV DEFAULT 0.1 */
52  double_MEMBER(classify_max_norm_scale_x, 0.325,
53  "Max char x-norm scale ...",
54  this->params()), /* PREV DEFAULT 0.3 */
55  double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
56  this->params()), /* PREV DEFAULT 0.1 */
57  double_MEMBER(classify_max_norm_scale_y, 0.325,
58  "Max char y-norm scale ...",
59  this->params()), /* PREV DEFAULT 0.3 */
60  double_MEMBER(classify_max_rating_ratio, 1.5,
61  "Veto ratio between classifier ratings", this->params()),
62  double_MEMBER(classify_max_certainty_margin, 5.5,
63  "Veto difference between classifier certainties",
64  this->params()),
65  BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
66  this->params()),
67  BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
68  this->params()),
69  BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
70  "Enable adaptive classifier", this->params()),
71  BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
72  "Use pre-adapted classifier templates", this->params()),
73  BOOL_MEMBER(classify_save_adapted_templates, 0,
74  "Save adapted templates to a file", this->params()),
75  BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
76  this->params()),
77  BOOL_MEMBER(classify_nonlinear_norm, 0,
78  "Non-linear stroke-density normalization", this->params()),
79  INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
80  INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
81  INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
82  this->params()),
83  double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
84  this->params()),
85  double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
86  this->params()),
87  double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
88  this->params()),
89  double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
90  this->params()),
91  double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
92  this->params()),
93  double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
94  this->params()),
95  INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
96  this->params()),
97  INT_MEMBER(matcher_min_examples_for_prototyping, 3,
98  "Reliable Config Threshold", this->params()),
99  INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
100  "Enable adaption even if the ambiguities have not been seen",
101  this->params()),
102  double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
103  "Maximum angle delta for prototype clustering",
104  this->params()),
105  double_MEMBER(classify_misfit_junk_penalty, 0.0,
106  "Penalty to apply when a non-alnum is vertically out of "
107  "its expected textline position",
108  this->params()),
109  double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
110  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
111  this->params()),
112  double_MEMBER(tessedit_class_miss_scale, 0.00390625,
113  "Scale factor for features not used", this->params()),
115  classify_adapted_pruning_factor, 2.5,
116  "Prune poor adapted results this much worse than best result",
117  this->params()),
118  double_MEMBER(classify_adapted_pruning_threshold, -1.0,
119  "Threshold at which classify_adapted_pruning_factor starts",
120  this->params()),
121  INT_MEMBER(classify_adapt_proto_threshold, 230,
122  "Threshold for good protos during adaptive 0-255",
123  this->params()),
124  INT_MEMBER(classify_adapt_feature_threshold, 230,
125  "Threshold for good features during adaptive 0-255",
126  this->params()),
128  "Do not include character fragments in the"
129  " results of the classifier",
130  this->params()),
131  double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
132  -3.0,
133  "Exclude fragments that do not look like whole"
134  " characters from training and adaption",
135  this->params()),
136  BOOL_MEMBER(classify_debug_character_fragments, FALSE,
137  "Bring up graphical debugging windows for fragments training",
138  this->params()),
139  BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
140  "Use two different windows for debugging the matching: "
141  "One for the protos and one for the features.",
142  this->params()),
143  STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
144  this->params()),
145  INT_MEMBER(classify_class_pruner_threshold, 229,
146  "Class Pruner Threshold 0-255", this->params()),
147  INT_MEMBER(classify_class_pruner_multiplier, 15,
148  "Class Pruner Multiplier 0-255: ", this->params()),
149  INT_MEMBER(classify_cp_cutoff_strength, 7,
150  "Class Pruner CutoffStrength: ", this->params()),
152  "Integer Matcher Multiplier 0-255: ", this->params()),
153  EnableLearning(true),
154  INT_MEMBER(il1_adaption_test, 0,
155  "Don't adapt to i/I at beginning of word", this->params()),
156  BOOL_MEMBER(classify_bln_numeric_mode, 0,
157  "Assume the input is numbers [0-9].", this->params()),
158  double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
159  this->params()),
160  double_MEMBER(speckle_rating_penalty, 10.0,
161  "Penalty to add to worst rating for noise", this->params()),
162  shape_table_(NULL),
163  dict_(this),
164  static_classifier_(NULL) {
165  fontinfo_table_.set_compare_callback(
167  fontinfo_table_.set_clear_callback(
169  fontset_table_.set_compare_callback(
171  fontset_table_.set_clear_callback(
173  AdaptedTemplates = NULL;
174  BackupAdaptedTemplates = NULL;
175  PreTrainedTemplates = NULL;
176  AllProtosOn = NULL;
177  AllConfigsOn = NULL;
178  AllConfigsOff = NULL;
179  TempProtoMask = NULL;
180  NormProtos = NULL;
181 
182  NumAdaptationsFailed = 0;
183 
184  learn_debug_win_ = NULL;
185  learn_fragmented_word_debug_win_ = NULL;
186  learn_fragments_debug_win_ = NULL;
187 
188  CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
189  BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
190 }
191 
194  delete learn_debug_win_;
195  delete learn_fragmented_word_debug_win_;
196  delete learn_fragments_debug_win_;
197  delete[] CharNormCutoffs;
198  delete[] BaselineCutoffs;
199 }
200 
201 
202 // Takes ownership of the given classifier, and uses it for future calls
203 // to CharNormClassifier.
205  delete static_classifier_;
206  static_classifier_ = static_classifier;
207 }
208 
209 // Moved from speckle.cpp
210 // Adds a noise classification result that is a bit worse than the worst
211 // current result, or the worst possible result if no current results.
212 void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
213  BLOB_CHOICE_IT bc_it(choices);
214  // If there is no classifier result, we will use the worst possible certainty
215  // and corresponding rating.
216  float certainty = -getDict().certainty_scale;
217  float rating = rating_scale * blob_length;
218  if (!choices->empty() && blob_length > 0) {
219  bc_it.move_to_last();
220  BLOB_CHOICE* worst_choice = bc_it.data();
221  // Add speckle_rating_penalty to worst rating, matching old value.
222  rating = worst_choice->rating() + speckle_rating_penalty;
223  // Compute the rating to correspond to the certainty. (Used to be kept
224  // the same, but that messes up the language model search.)
225  certainty = -rating * getDict().certainty_scale /
226  (rating_scale * blob_length);
227  }
228  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
229  -1, 0.0f, MAX_FLOAT32, 0,
231  bc_it.add_to_end(blob_choice);
232 }
233 
234 // Returns true if the blob is small enough to be a large speckle.
235 bool Classify::LargeSpeckle(const TBLOB &blob) {
236  double speckle_size = kBlnXHeight * speckle_large_max_size;
237  TBOX bbox = blob.bounding_box();
238  return bbox.width() < speckle_size && bbox.height() < speckle_size;
239 }
240 
241 
242 } // namespace tesseract
UnicityTable< FontSet > fontset_table_
Definition: classify.h:496
#define TRUE
Definition: capi.h:45
double certainty_scale
Definition: dict.h:609
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:235
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:473
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:139
inT16 width() const
Definition: rect.h:111
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:146
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307
BIT_VECTOR TempProtoMask
Definition: classify.h:483
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:469
bool disable_character_fragments
TBOX bounding_box() const
Definition: blobs.cpp:482
int classify_integer_matcher_multiplier
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488
#define MAX_FLOAT32
Definition: host.h:57
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:128
unsigned short uinT16
Definition: host.h:34
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
float rating() const
Definition: ratngs.h:79
#define FALSE
Definition: capi.h:46
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
BIT_VECTOR AllProtosOn
Definition: classify.h:480
const int kBlnXHeight
Definition: normalis.h:28
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
Dict & getDict()
Definition: classify.h:65
virtual ~Classify()
Definition: classify.cpp:192
BIT_VECTOR AllConfigsOn
Definition: classify.h:481
inT16 height() const
Definition: rect.h:104
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:212
TBLOB
Definition: blobs.h:261
BIT_VECTOR AllConfigsOff
Definition: classify.h:482
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:456
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
double speckle_large_max_size
Definition: classify.h:501
TBOX
Definition: rect.h:30
void SetStaticClassifier(ShapeClassifier *static_classifier)
Definition: classify.cpp:204
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:477
NORM_PROTOS * NormProtos
Definition: classify.h:486
double speckle_rating_penalty
Definition: classify.h:503