tesseract  3.05.02
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

 DECLARE_STRING_PARAM_FLAG (D)
 
int main (int argc, char **argv)
 
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
 
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
 
int main (int argc, char *argv[])
 

Variables

CLUSTERCONFIG CNConfig
 

Macro Definition Documentation

◆ PROGRAM_FEATURE_TYPE

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 40 of file cntraining.cpp.

Function Documentation

◆ DECLARE_STRING_PARAM_FLAG()

DECLARE_STRING_PARAM_FLAG ( )

◆ main() [1/2]

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 388 of file tesseractmain.cpp.

388  {
389  const char* lang = "eng";
390  const char* image = NULL;
391  const char* outputbase = NULL;
392  const char* datapath = NULL;
393  bool list_langs = false;
394  bool print_parameters = false;
395  int arg_i = 1;
398  /* main() calls functions like ParseArgs which call exit().
399  * This results in memory leaks if vars_vec and vars_values are
400  * declared as auto variables (destructor is not called then). */
401  static GenericVector<STRING> vars_vec;
402  static GenericVector<STRING> vars_values;
403 
404 #ifdef NDEBUG
405  // Disable debugging and informational messages from Leptonica.
406  setMsgSeverity(L_SEVERITY_ERROR);
407 #endif
408 
409 #if defined(HAVE_TIFFIO_H) && defined(_WIN32)
410  /* Show libtiff warnings on console (not in GUI). */
411  TIFFSetWarningHandler(Win32WarningHandler);
412 #endif /* HAVE_TIFFIO_H && _WIN32 */
413 
414  ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
415  &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
416  &enginemode);
417 
418  bool banner = false;
419  if (outputbase != NULL && strcmp(outputbase, "-") &&
420  strcmp(outputbase, "stdout")) {
421  banner = true;
422  }
423 
424  PERF_COUNT_START("Tesseract:main")
425 
426  // Call GlobalDawgCache here to create the global DawgCache object before
427  // the TessBaseAPI object. This fixes the order of destructor calls:
428  // first TessBaseAPI must be destructed, DawgCache must be the last object.
429  tesseract::Dict::GlobalDawgCache();
430 
431  // Avoid memory leak caused by auto variable when exit() is called.
432  static tesseract::TessBaseAPI api;
433 
434  api.SetOutputName(outputbase);
435 
436  int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
437  argc - arg_i, &vars_vec, &vars_values, false);
438  if (init_failed) {
439  fprintf(stderr, "Could not initialize tesseract.\n");
440  return EXIT_FAILURE;
441  }
442 
443  SetVariablesFromCLArgs(&api, argc, argv);
444 
445  if (list_langs) {
447  return EXIT_SUCCESS;
448  }
449 
450  if (print_parameters) {
451  FILE* fout = stdout;
452  fprintf(stdout, "Tesseract parameters:\n");
453  api.PrintVariables(fout);
454  api.End();
455  return EXIT_SUCCESS;
456  }
457 
458  FixPageSegMode(&api, pagesegmode);
459 
460  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
461  int ret_val = EXIT_SUCCESS;
462 
463  Pix* pixs = pixRead(image);
464  if (!pixs) {
465  fprintf(stderr, "Cannot open input file: %s\n", image);
466  return 2;
467  }
468 
469  api.SetImage(pixs);
470 
471  tesseract::Orientation orientation;
474  float deskew_angle;
475 
476  tesseract::PageIterator* it = api.AnalyseLayout();
477  if (it) {
478  it->Orientation(&orientation, &direction, &order, &deskew_angle);
479  tprintf(
480  "Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n"
481  "Deskew angle: %.4f\n",
482  orientation, direction, order, deskew_angle);
483  } else {
484  ret_val = EXIT_FAILURE;
485  }
486 
487  delete it;
488 
489  pixDestroy(&pixs);
490  return ret_val;
491  }
492 
493  // set in_training_mode to true when using one of these configs:
494  // ambigs.train, box.train, box.train.stderr, linebox, rebox
495  bool b = false;
496  bool in_training_mode =
497  (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
498  (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
499  (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
500 
501  // Avoid memory leak caused by auto variable when exit() is called.
503 
504  if (in_training_mode) {
505  renderers.push_back(NULL);
506  } else {
507  PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
508  }
509 
510  if (!renderers.empty()) {
511  if (banner) PrintBanner();
512  bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
513  if (!succeed) {
514  fprintf(stderr, "Error during processing.\n");
515  return EXIT_FAILURE;
516  }
517  }
518 
520 
521  return EXIT_SUCCESS;
522 }
void PrintLangsList(tesseract::TessBaseAPI *api)
void PreloadRenderers(tesseract::TessBaseAPI *api, tesseract::PointerVector< tesseract::TessResultRenderer > *renderers, tesseract::PageSegMode pagesegmode, const char *outputbase)
struct TessBaseAPI TessBaseAPI
Definition: capi.h:86
void SetVariablesFromCLArgs(tesseract::TessBaseAPI *api, int argc, char **argv)
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
int push_back(T * object)
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:155
void FixPageSegMode(tesseract::TessBaseAPI *api, tesseract::PageSegMode pagesegmode)
void PrintBanner()
#define PERF_COUNT_START(FUNCT_NAME)
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:156
#define PERF_COUNT_END
bool empty() const
Definition: genericvector.h:84
void ParseArgs(const int argc, char **argv, const char **lang, const char **image, const char **outputbase, const char **datapath, bool *list_langs, bool *print_parameters, GenericVector< STRING > *vars_vec, GenericVector< STRING > *vars_values, int *arg_i, tesseract::PageSegMode *pagesegmode, tesseract::OcrEngineMode *enginemode)

◆ main() [2/2]

int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.

Definition at line 133 of file cntraining.cpp.

133  {
134  // Set the global Config parameters before parsing the command line.
135  Config = CNConfig;
136 
137  const char *PageName;
138  FILE *TrainingPage;
139  LIST CharList = NIL_LIST;
140  CLUSTERER *Clusterer = NULL;
141  LIST ProtoList = NIL_LIST;
142  LIST NormProtoList = NIL_LIST;
143  LIST pCharList;
144  LABELEDLIST CharSample;
145  FEATURE_DEFS_STRUCT FeatureDefs;
146  InitFeatureDefs(&FeatureDefs);
147 
148  ParseArguments(&argc, &argv);
149  int num_fonts = 0;
150  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
151  printf("Reading %s ...\n", PageName);
152  TrainingPage = Efopen(PageName, "rb");
154  100, NULL, TrainingPage, &CharList);
155  fclose(TrainingPage);
156  ++num_fonts;
157  }
158  printf("Clustering ...\n");
159  // To allow an individual font to form a separate cluster,
160  // reduce the min samples:
161  // Config.MinSamples = 0.5 / num_fonts;
162  pCharList = CharList;
163  // The norm protos will count the source protos, so we keep them here in
164  // freeable_protos, so they can be freed later.
165  GenericVector<LIST> freeable_protos;
166  iterate(pCharList) {
167  //Cluster
168  CharSample = (LABELEDLIST)first_node(pCharList);
169  Clusterer =
170  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
171  if (Clusterer == NULL) { // To avoid a SIGSEGV
172  fprintf(stderr, "Error: NULL clusterer!\n");
173  return 1;
174  }
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  freeable_protos.push_back(ProtoList);
194  FreeClusterer(Clusterer);
195  }
196  FreeTrainingSamples(CharList);
197  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
198  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
199  FeatureDefs.FeatureDesc[desc_index]);
200  FreeNormProtoList(NormProtoList);
201  for (int i = 0; i < freeable_protos.size(); ++i) {
202  FreeProtoList(&freeable_protos[i]);
203  }
204  printf ("\n");
205  return 0;
206 } // main
void FreeNormProtoList(LIST CharList)
#define first_node(l)
Definition: oldlist.h:139
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
void ParseArguments(int *argc, char ***argv)
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:76
int push_back(T object)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:547
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FLOAT32 MinSamples
Definition: cluster.h:50
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
const char * GetNextFilename(int argc, const char *const *argv)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:513
void FreeTrainingSamples(LIST CharList)
int MagicSamples
Definition: cluster.h:55
int size() const
Definition: genericvector.h:72
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:40
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
Definition: cntraining.cpp:224
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:574

◆ WriteNormProtos()

void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
const FEATURE_DESC_STRUCT *  feature_desc 
)

This routine writes the specified samples into files which are organized according to the font name and character name of the samples.

Parameters
Directory: directory to place sample files into
LabeledProtoList: list of labeled protos
feature_desc: description of the features
Returns
none
Note
Exceptions: none
History: Fri Aug 18 16:17:06 1989, DSJ, Created.

Definition at line 224 of file cntraining.cpp.

225  {
226  FILE *File;
227  STRING Filename;
228  LABELEDLIST LabeledProto;
229  int N;
230 
231  Filename = "";
232  if (Directory != NULL && Directory[0] != '\0')
233  {
234  Filename += Directory;
235  Filename += "/";
236  }
237  Filename += "normproto";
238  printf ("\nWriting %s ...", Filename.string());
239  File = Efopen (Filename.string(), "wb");
240  fprintf(File, "%0d\n", feature_desc->NumParams);
241  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
242  iterate(LabeledProtoList)
243  {
244  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
245  N = NumberOfProtos(LabeledProto->List, true, false);
246  if (N < 1) {
247  printf ("\nError! Not enough protos for %s: %d protos"
248  " (%d significant protos"
249  ", %d insignificant protos)\n",
250  LabeledProto->Label, N,
251  NumberOfProtos(LabeledProto->List, 1, 0),
252  NumberOfProtos(LabeledProto->List, 0, 1));
253  exit(1);
254  }
255  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
256  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
257  }
258  fclose (File);
259 
260 } // WriteNormProtos
#define first_node(l)
Definition: oldlist.h:139
const char * string() const
Definition: strngs.cpp:201
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
Definition: strngs.h:44
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:319
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:263

◆ WriteProtos()

void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 263 of file cntraining.cpp.

269 {
270  PROTOTYPE *Proto;
271 
272  // write prototypes
273  iterate(ProtoList)
274  {
275  Proto = (PROTOTYPE *) first_node ( ProtoList );
276  if (( Proto->Significant && WriteSigProtos ) ||
277  ( ! Proto->Significant && WriteInsigProtos ) )
278  WritePrototype( File, N, Proto );
279  }
280 } // WriteProtos
#define first_node(l)
Definition: oldlist.h:139
unsigned Significant
Definition: cluster.h:68
#define iterate(l)
Definition: oldlist.h:159
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348

Variable Documentation

◆ CNConfig

CLUSTERCONFIG CNConfig
Initial value:
=
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Definition at line 76 of file cntraining.cpp.