tesseract  3.05.02
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data as formatted for training,
16 // but doesn't have to be the same as the training data.
17 // Author: Ray Smith
18 
19 #include <stdio.h>
20 #ifndef USE_STD_NAMESPACE
21 #include "base/commandlineflags.h"
22 #endif // USE_STD_NAMESPACE
23 #include "baseapi.h"
24 #include "commontraining.h"
25 #ifndef NO_CUBE_BUILD
26 #include "cubeclassifier.h"
27 #endif // NO_CUBE_BUILD
28 #include "mastertrainer.h"
29 #include "params.h"
30 #include "strngs.h"
31 #include "tessclassifier.h"
32 
33 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
34 STRING_PARAM_FLAG(lang, "eng", "Language to test");
35 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
36 DECLARE_INT_PARAM_FLAG(debug_level);
38 
// NOTE(review): the opening line and the enumerators of this enum are not
// visible in this listing. Code further down uses CN_PRUNER, CN_FULL, CN_CUBE,
// CN_CUBETESS and CN_COUNT as ClassifierName values, so they are presumably
// declared here, with the cube-related ones guarded by this #ifndef - confirm
// against the full file.
42 #ifndef NO_CUBE_BUILD
45 #endif // NO_CUBE_BUILD
47 };
48 
// Command-line spellings of the classifiers that can be tested.
// InitializeClassifier compares its classifer_name argument against
// names[0..CN_COUNT-1] and casts the matching index to ClassifierName, so the
// entries here must stay in the same order as the enumerators (the enum body is
// not visible in this listing - verify the order there).
// The "cube" and "cubetess" entries are only compiled in when NO_CUBE_BUILD is
// not defined.
// The array ends with a NULL entry; the visible lookup loop is bounded by
// CN_COUNT and does not rely on that terminator.
49 const char* names[] = {"pruner", "full",
50 #ifndef NO_CUBE_BUILD
51  "cube", "cubetess",
52 #endif // NO_CUBE_BUILD
53  NULL};
54 
// Builds the ShapeClassifier selected by classifer_name.
//
// Steps visible in this listing:
//  1. Look classifer_name up in names[] to get a ClassifierName value.
//  2. Initialize the Tesseract API object reached through *api, using
//     FLAGS_tessdata_dir and FLAGS_lang, and require it to have a ShapeTable.
//  3. If FLAGS_T is non-empty, read every remaining config file returned by
//     GetNextFilename(argc, argv) into the API.
//  4. Allocate the matching classifier with new and return it.
//
// Parameters:
//  classifer_name - name to look up in names[].
//  unicharset     - NOTE(review): not referenced in any line visible here.
//  argc, argv     - forwarded to GetNextFilename to iterate config files.
//  (The last parameter and the closing of the signature are not visible in
//  this listing; the body dereferences a pointer named api, and main passes
//  &api for it.)
//
// Returns the new classifier, which main releases with delete, or NULL after
// printing a message to stderr when the name is unknown, Tesseract fails to
// initialize, the ShapeTable is missing, or no branch handles the classifier.
//
// NOTE(review): none of the NULL-returning paths visible here release *api;
// main also returns immediately on NULL without deleting api - confirm whether
// that is acceptable for this tool.
55 static tesseract::ShapeClassifier* InitializeClassifier(
56  const char* classifer_name, const UNICHARSET& unicharset,
57  int argc, char **argv,
59  // Decode the classifier string.
// CN_COUNT doubles as the "not found" marker for the lookup below.
60  ClassifierName classifier = CN_COUNT;
61  for (int c = 0; c < CN_COUNT; ++c) {
62  if (strcmp(classifer_name, names[c]) == 0) {
63  classifier = static_cast<ClassifierName>(c);
64  break;
65  }
66  }
67  if (classifier == CN_COUNT) {
// NOTE(review): this prints the global FLAGS_classifier, not the
// classifer_name parameter that was actually looked up. The only caller
// visible in this file passes FLAGS_classifier.c_str(), so they agree there.
68  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
69  return NULL;
70  }
71 
72  // We need to initialize tesseract to test.
// NOTE(review): several lines of this section are missing from the listing
// (the statement controlled by the cube `if` below, and the declarations of
// engine_mode and tesseract that later lines use). Consult the full file
// before editing here.
75 #ifndef NO_CUBE_BUILD
76  if (classifier == CN_CUBE || classifier == CN_CUBETESS)
78 #endif // NO_CUBE_BUILD
80  tesseract::Classify* classify = NULL;
81  if (
82 #ifndef NO_CUBE_BUILD
83  classifier == CN_CUBE || classifier == CN_CUBETESS ||
84 #endif // NO_CUBE_BUILD
85  classifier == CN_PRUNER || classifier == CN_FULL) {
86 #ifndef NO_CUBE_BUILD
// Set before Init() and for every classifier kind in a cube-enabled build,
// not only for the cube classifiers.
87  (*api)->SetVariable("cube_debug_level", "2");
88 #endif // NO_CUBE_BUILD
89  if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
90  engine_mode) < 0) {
91  fprintf(stderr, "Tesseract initialization failed!\n");
92  return NULL;
93  }
// The API only hands out a const pointer; constness is cast away because the
// classifiers constructed below take non-const pointers.
94  tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
// NOTE(review): reinterpret_cast performs no pointer adjustment. If
// tesseract::Tesseract derives from tesseract::Classify, static_cast (or an
// implicit conversion) would be the safer spelling - confirm the hierarchy.
95  classify = reinterpret_cast<tesseract::Classify*>(tesseract);
96  if (classify->shape_table() == NULL) {
97  fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
98  return NULL;
99  }
100  }
101  tesseract::ShapeClassifier* shape_classifier = NULL;
102 
// Config files are applied after Init(), and only when the T flag is set.
103  if (!FLAGS_T.empty()) {
104  const char* config_name;
105  while ((config_name = GetNextFilename(argc, argv)) != NULL) {
106  tprintf("Reading config file %s ...\n", config_name);
107  (*api)->ReadConfigFile(config_name);
108  }
109  }
// The bool passed to TessClassifier is true for "pruner" and false for "full".
110  if (classifier == CN_PRUNER) {
111  shape_classifier = new tesseract::TessClassifier(true, classify);
112  } else if (classifier == CN_FULL) {
113  shape_classifier = new tesseract::TessClassifier(false, classify);
114 #ifndef NO_CUBE_BUILD
115  } else if (classifier == CN_CUBE) {
116  shape_classifier = new tesseract::CubeClassifier(tesseract);
117  } else if (classifier == CN_CUBETESS) {
118  shape_classifier = new tesseract::CubeTessClassifier(tesseract);
119 #endif // NO_CUBE_BUILD
120  } else {
// Reached only for a ClassifierName value that passed the lookup above but
// has no branch here.
121  fprintf(stderr, "%s tester not yet implemented\n", classifer_name);
122  return NULL;
123  }
124  tprintf("Testing classifier %s:\n", classifer_name);
125  return shape_classifier;
126 }
127 
128 // This program has complex setup requirements, so here is some help:
129 // Two different modes, tr files and serialized mastertrainer.
130 // From tr files:
131 // classifier_tester -U unicharset -F font_properties -X xheights
132 // -classifier x -lang lang [-output_trainer trainer] *.tr
133 // From a serialized trainer:
134 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
135 //
136 // In the first case, the unicharset must be the unicharset from within
137 // the classifier under test, and the font_properties and xheights files must
138 // match the files used during training.
139 // In the second case, the trainer file must have been prepared from
140 // some previous run of shapeclustering, mftraining, or classifier_tester
141 // using the same conditions as above, i.e. matching unicharset/font_properties.
142 //
143 // Available values of classifier (x above) are:
144 // pruner : Tesseract class pruner only.
145 // full : Tesseract full classifier.
146 // cube : Cube classifier. (Not possible with an input trainer.)
147 // cubetess : Tesseract class pruner with rescoring by Cube. (Not possible
148 // with an input trainer.)
// Entry point: parses the command line, loads the training data, builds the
// classifier named by FLAGS_classifier through InitializeClassifier, runs the
// test over the samples, then deletes the classifier, the API object and the
// trainer.
// Returns 1 if the classifier could not be created, 0 otherwise.
//
// NOTE(review): the declarations of `trainer` and `api`, and the first lines of
// two multi-line calls, are missing from this listing; consult the full file
// before editing the affected statements.
149 int main(int argc, char **argv) {
150  ParseArguments(&argc, &argv);
151  STRING file_prefix;
// Tail of a call whose first line is not visible here; presumably the
// LoadTrainingData call that produces `trainer` - confirm.
153  argc, argv, false, NULL, &file_prefix);
155  // Decode the classifier string.
// NOTE(review): trainer is dereferenced with no null check visible in this
// listing - confirm the loader cannot return NULL.
156  tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
157  FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
158  if (shape_classifier == NULL) {
159  fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
// NOTE(review): this early exit skips the deletes of api and trainer at the
// bottom of main; the process is terminating, so it only matters to leak
// checkers.
160  return 1;
161  }
162 
163  // We want to test junk as well if it is available.
164  // trainer->IncludeJunk();
165  // We want to test with replicated samples too.
167 
// Tail of a call whose first line is not visible here. The report level passed
// is FLAGS_debug_level raised to at least 3 by MAX.
169  MAX(3, FLAGS_debug_level), false,
170  shape_classifier, NULL);
// The classifier is deleted before the API object and the trainer.
171  delete shape_classifier;
172  delete api;
173  delete trainer;
174 
175  return 0;
176 } /* main */
177 
178 
179 
180 
181 
182 
struct TessBaseAPI TessBaseAPI
Definition: capi.h:86
const char * names[]
ClassifierName
void ParseArguments(int *argc, char ***argv)
DECLARE_STRING_PARAM_FLAG(T)
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
void ReplicateAndRandomizeSamplesIfRequired()
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
const ShapeTable * shape_table() const
Definition: classify.h:69
const char * GetNextFilename(int argc, const char *const *argv)
STRING_PARAM_FLAG(classifier, "", "Classifier to test")
int main(int argc, char **argv)
#define MAX(x, y)
Definition: ndminx.h:24
#define tprintf(...)
Definition: tprintf.h:31
Definition: strngs.h:44
DECLARE_INT_PARAM_FLAG(debug_level)
const UNICHARSET & unicharset() const