tesseract  3.05.02
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  * Created: Tue Jan 07 15:21:46 GMT 1992
8  *
9  * (C) Copyright 1992, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "stderr.h"
28 #include "basedir.h"
29 #include "tessvars.h"
30 #include "control.h"
31 #include "reject.h"
32 #include "pageres.h"
33 #include "nwmain.h"
34 #include "pgedit.h"
35 #include "tprintf.h"
36 #include "tessedit.h"
37 #include "stopper.h"
38 #include "intmatcher.h"
39 #include "chop.h"
40 #include "efio.h"
41 #include "danerror.h"
42 #include "globals.h"
43 #include "tesseractclass.h"
44 #include "params.h"
45 
46 #define VARDIR "configs/" /*variables files */
47  // config under api
48 #define API_CONFIG "configs/api_config"
49 
50 ETEXT_DESC *global_monitor = NULL; // progress monitor
51 
52 namespace tesseract {
53 
54 // Read a "config" file containing a set of variable, value pairs.
55 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
56 // and also accepts a relative or absolute path name.
58  SetParamConstraint constraint) {
59  STRING path = datadir;
60  path += "configs/";
61  path += filename;
62  FILE* fp;
63  if ((fp = fopen(path.string(), "rb")) != NULL) {
64  fclose(fp);
65  } else {
66  path = datadir;
67  path += "tessconfigs/";
68  path += filename;
69  if ((fp = fopen(path.string(), "rb")) != NULL) {
70  fclose(fp);
71  } else {
72  path = filename;
73  }
74  }
75  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
76 }
77 
78 // Returns false if a unicharset file for the specified language was not found
79 // or was invalid.
80 // This function initializes TessdataManager. After TessdataManager is
81 // no longer needed, TessdataManager::End() should be called.
82 //
83 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
84 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
85 // from the language-specific config file (stored in [lang].traineddata), from
86 // the config files specified on the command line or left as the default
87 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
89  const char *arg0, const char *textbase, const char *language,
90  OcrEngineMode oem, char **configs, int configs_size,
91  const GenericVector<STRING> *vars_vec,
92  const GenericVector<STRING> *vars_values,
93  bool set_only_non_debug_params) {
94  // Set the basename, compute the data directory.
95  main_setup(arg0, textbase);
96 
97  // Set the language data path prefix
98  lang = language != NULL ? language : "eng";
102 
103  // Initialize TessdataManager.
104  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
105  if (!tessdata_manager.Init(tessdata_path.string(),
107  return false;
108  }
109 
110  // If a language specific config file (lang.config) exists, load it in.
117  tprintf("Loaded language config file\n");
118  }
119  }
120 
121  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
123  // Load tesseract variables from config files. This is done after loading
124  // language-specific variables from [lang].traineddata file, so that custom
125  // config files can override values in [lang].traineddata file.
126  for (int i = 0; i < configs_size; ++i) {
127  read_config_file(configs[i], set_params_constraint);
128  }
129 
130  // Set params specified in vars_vec (done after setting params from config
131  // files, so that params in vars_vec can override those from files).
132  if (vars_vec != NULL && vars_values != NULL) {
133  for (int i = 0; i < vars_vec->size(); ++i) {
134  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
135  (*vars_values)[i].string(),
136  set_params_constraint, this->params())) {
137  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
138  exit(1);
139  }
140  }
141  }
142 
143  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
144  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
145  if (params_file != NULL) {
146  ParamUtils::PrintParams(params_file, this->params());
147  fclose(params_file);
149  tprintf("Wrote parameters to %s\n",
151  }
152  } else {
153  tprintf("Failed to open %s for writing params.\n",
155  }
156  }
157 
158  // Determine which ocr engine(s) should be loaded and used for recognition.
159  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
161  tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
162  static_cast<int>(tessedit_ocr_engine_mode));
163  }
164 
165  // If we are only loading the config file (and so not planning on doing any
166  // recognition) then there's nothing else to do here.
169  tprintf("Returning after loading config file\n");
170  }
171  return true;
172  }
173 
174  // Load the unicharset
177  return false;
178  }
179  if (unicharset.size() > MAX_NUM_CLASSES) {
180  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
181  return false;
182  }
183  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
184  right_to_left_ = unicharset.major_right_to_left();
185 
186  // Setup initial unichar ambigs table and read universal ambigs.
187  UNICHARSET encoder_unicharset;
188  encoder_unicharset.CopyFrom(unicharset);
190  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
191 
194  TFile ambigs_file;
195  ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
198  encoder_unicharset,
199  &ambigs_file,
201  if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
202  }
203 
204  // The various OcrEngineMode settings (see publictypes.h) determine which
205  // engine-specific data files need to be loaded. Currently everything needs
206  // the base tesseract data, which supplies other useful information, but
207  // alternative engines, such as cube and LSTM are optional.
208 #ifndef NO_CUBE_BUILD
212  tprintf("Loaded Cube w/out combiner\n");
216  tprintf("Loaded Cube with combiner\n");
217  }
218 #endif
219  // Init ParamsModel.
220  // Load pass1 and pass2 weights (for now these two sets are the same, but in
221  // the future separate sets of weights can be generated).
222  for (int p = ParamsModel::PTRAIN_PASS1;
225  static_cast<ParamsModel::PassEnum>(p));
230  return false;
231  }
232  }
233  }
235 
236  return true;
237 }
238 
239 // Helper returns true if the given string is in the vector of strings.
240 static bool IsStrInList(const STRING& str,
241  const GenericVector<STRING>& str_list) {
242  for (int i = 0; i < str_list.size(); ++i) {
243  if (str_list[i] == str)
244  return true;
245  }
246  return false;
247 }
248 
249 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
250 // Langs with no prefix get appended to to_load, provided they
251 // are not in there already.
252 // Langs with ~ prefix get appended to not_to_load, provided they are not in
253 // there already.
254 void Tesseract::ParseLanguageString(const char* lang_str,
255  GenericVector<STRING>* to_load,
256  GenericVector<STRING>* not_to_load) {
257  STRING remains(lang_str);
258  while (remains.length() > 0) {
259  // Find the start of the lang code and which vector to add to.
260  const char* start = remains.string();
261  while (*start == '+')
262  ++start;
263  GenericVector<STRING>* target = to_load;
264  if (*start == '~') {
265  target = not_to_load;
266  ++start;
267  }
268  // Find the index of the end of the lang code in string start.
269  int end = strlen(start);
270  const char* plus = strchr(start, '+');
271  if (plus != NULL && plus - start < end)
272  end = plus - start;
273  STRING lang_code(start);
274  lang_code.truncate_at(end);
275  STRING next(start + end);
276  remains = next;
277  // Check whether lang_code is already in the target vector and add.
278  if (!IsStrInList(lang_code, *target)) {
280  tprintf("Adding language '%s' to list\n", lang_code.string());
281  target->push_back(lang_code);
282  }
283  }
284 }
285 
286 // Initialize for potentially a set of languages defined by the language
287 // string and recursively any additional languages required by any language
288 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
289 // See init_tesseract_internal for args.
291  const char *arg0, const char *textbase, const char *language,
292  OcrEngineMode oem, char **configs, int configs_size,
293  const GenericVector<STRING> *vars_vec,
294  const GenericVector<STRING> *vars_values,
295  bool set_only_non_debug_params) {
296  GenericVector<STRING> langs_to_load;
297  GenericVector<STRING> langs_not_to_load;
298  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
299 
300  sub_langs_.delete_data_pointers();
301  sub_langs_.clear();
302  // Find the first loadable lang and load into this.
303  // Add any languages that this language requires
304  bool loaded_primary = false;
305  // Load the rest into sub_langs_.
306  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
307  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
308  const char *lang_str = langs_to_load[lang_index].string();
309  Tesseract *tess_to_init;
310  if (!loaded_primary) {
311  tess_to_init = this;
312  } else {
313  tess_to_init = new Tesseract;
314  }
315 
316  int result = tess_to_init->init_tesseract_internal(
317  arg0, textbase, lang_str, oem, configs, configs_size,
318  vars_vec, vars_values, set_only_non_debug_params);
319 
320  if (!loaded_primary) {
321  if (result < 0) {
322  tprintf("Failed loading language '%s'\n", lang_str);
323  } else {
325  tprintf("Loaded language '%s' as main language\n", lang_str);
326  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
327  &langs_to_load, &langs_not_to_load);
328  loaded_primary = true;
329  }
330  } else {
331  if (result < 0) {
332  tprintf("Failed loading language '%s'\n", lang_str);
333  delete tess_to_init;
334  } else {
336  tprintf("Loaded language '%s' as secondary language\n", lang_str);
337  sub_langs_.push_back(tess_to_init);
338  // Add any languages that this language requires
339  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
340  &langs_to_load, &langs_not_to_load);
341  }
342  }
343  }
344  }
345  if (!loaded_primary) {
346  tprintf("Tesseract couldn't load any languages!\n");
347  return -1; // Couldn't load any language!
348  }
349  if (!sub_langs_.empty()) {
350  // In multilingual mode word ratings have to be directly comparable,
351  // so use the same language model weights for all languages:
352  // use the primary language's params model if
353  // tessedit_use_primary_params_model is set,
354  // otherwise use default language model weights.
356  for (int s = 0; s < sub_langs_.size(); ++s) {
357  sub_langs_[s]->language_model_->getParamsModel().Copy(
359  }
360  tprintf("Using params model of the primary language\n");
363  }
364  } else {
366  for (int s = 0; s < sub_langs_.size(); ++s) {
367  sub_langs_[s]->language_model_->getParamsModel().Clear();
368  }
370  tprintf("Using default language params\n");
371  }
372  }
373 
375  return 0;
376 }
377 
378 // Common initialization for a single language.
379 // arg0 is the datapath for the tessdata directory, which could be the
380 // path of the tessdata directory with no trailing /, or (if tessdata
381 // lives in the same directory as the executable) the path of the executable,
382 // hence the name arg0.
383 // textbase is an optional output file basename (used only for training)
384 // language is the language code to load.
385 // oem controls which engine(s) will operate on the image
386 // configs (argv) is an array of config filenames to load variables from.
387 // May be NULL.
388 // configs_size (argc) is the number of elements in configs.
389 // vars_vec is an optional vector of variables to set.
390 // vars_values is an optional corresponding vector of values for the variables
391 // in vars_vec.
392 // set_only_non_debug_params selects the SetParamConstraint that is applied
393 // when params are set from the config files and from vars_vec.
395  const char *arg0, const char *textbase, const char *language,
396  OcrEngineMode oem, char **configs, int configs_size,
397  const GenericVector<STRING> *vars_vec,
398  const GenericVector<STRING> *vars_values,
399  bool set_only_non_debug_params) {
400  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
401  configs_size, vars_vec, vars_values,
402  set_only_non_debug_params)) {
403  return -1;
404  }
407  return 0;
408  }
409  // If only Cube will be used, skip loading Tesseract classifier's
410  // pre-trained templates.
411  bool init_tesseract_classifier =
414  // If only Cube will be used and if it has its own Unicharset,
415  // skip initializing permuter and loading Tesseract Dawgs.
416  bool init_dict =
419  program_editup(textbase, init_tesseract_classifier, init_dict);
421  return 0; //Normal exit
422 }
423 
424 // Helper builds the all_fonts table by adding new fonts from new_fonts.
425 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
426  UnicityTable<FontInfo>* all_fonts) {
427  for (int i = 0; i < new_fonts.size(); ++i) {
428  // UnicityTable uniques as we go.
429  all_fonts->push_back(new_fonts.get(i));
430  }
431 }
432 
433 // Helper assigns an id to lang_fonts using the index in all_fonts table.
434 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
435  UnicityTable<FontInfo>* lang_fonts) {
436  for (int i = 0; i < lang_fonts->size(); ++i) {
437  int index = all_fonts.get_id(lang_fonts->get(i));
438  lang_fonts->get_mutable(i)->universal_id = index;
439  }
440 }
441 
442 // Set the universal_id member of each font to be unique among all
443 // instances of the same font loaded.
445  // Note that we can get away with bitwise copying FontInfo in
446  // all_fonts, as it is a temporary structure and we avoid setting the
447  // delete callback.
448  UnicityTable<FontInfo> all_fonts;
450 
451  // Create the universal ID table.
452  CollectFonts(get_fontinfo_table(), &all_fonts);
453  for (int i = 0; i < sub_langs_.size(); ++i) {
454  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
455  }
456  // Assign ids from the table to each font table.
457  AssignIds(all_fonts, &get_fontinfo_table());
458  for (int i = 0; i < sub_langs_.size(); ++i) {
459  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
460  }
461  font_table_size_ = all_fonts.size();
462 }
463 
464 // init the LM component
465 int Tesseract::init_tesseract_lm(const char *arg0,
466  const char *textbase,
467  const char *language) {
468  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
469  NULL, 0, NULL, NULL, false))
470  return -1;
473  getDict().FinishLoad();
475  return 0;
476 }
477 
479  end_recog();
480 }
481 
482 /* Define command type identifiers */
483 
485 {
490 };
491 } // namespace tesseract
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
inT64 GetEndOffset(TessdataType tessdata_type) const
char * tessedit_write_params_to_file
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language)
Definition: tessedit.cpp:465
STRING lang
Definition: ccutil.h:67
inT32 length() const
Definition: strngs.cpp:196
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
ParamsModel & getParamsModel()
int size() const
Definition: unicharset.h:297
const STRING & GetDataFileName() const
bool use_ambigs_for_adaption
Definition: ccutil.h:91
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:35
bool FinishLoad()
Definition: dict.cpp:307
void Load(const char *data_file_name, const STRING &lang)
Definition: dict.cpp:224
void SetPass(PassEnum pass)
Definition: params_model.h:72
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:394
ETEXT_DESC * global_monitor
Definition: tessedit.cpp:50
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:57
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:206
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:71
STRING datadir
Definition: ccutil.h:65
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)
int push_back(T object)
TessdataManager tessdata_manager
Definition: ccutil.h:69
void SetupUniversalFontIds()
Definition: tessedit.cpp:444
int size() const
Return the size used.
void truncate_at(inT32 index)
Definition: strngs.cpp:272
const char * string() const
Definition: strngs.cpp:201
bool Init(const char *data_file_name, int debug_level)
bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset)
bool major_right_to_left() const
Definition: unicharset.cpp:931
Dict & getDict()
Definition: classify.h:65
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void program_editup(const char *textbase, bool init_classifier, bool init_permute)
Definition: tface.cpp:46
ParamsVectors * params()
Definition: ccutil.h:63
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:97
#define tprintf(...)
Definition: tprintf.h:31
LanguageModel * language_model_
Definition: wordrec.h:411
Definition: strngs.h:44
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:254
int size() const
Definition: genericvector.h:72
int ambigs_debug_level
Definition: ccutil.h:87
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:65
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:88
static bool TESS_API ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
STRING language_data_path_prefix
Definition: ccutil.h:68
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
bool SeekToStart(TessdataType tessdata_type)
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
T * get_mutable(int id)
const T & get(int id) const
Return the object from an id.
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:179
UNICHARSET unicharset
Definition: ccutil.h:70
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:290
int get_id(T object) const
int push_back(T object)
Add an element in the table.
#define ASSERT_HOST(x)
Definition: errcode.h:84
SetParamConstraint
Definition: params.h:36