tesseract  3.05.02
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revised by: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 /*----------------------------------------------------------------------------
24  Include Files and Type Defines
25 ----------------------------------------------------------------------------*/
26 #include "oldlist.h"
27 #include "efio.h"
28 #include "emalloc.h"
29 #include "featdefs.h"
30 #include "tessopt.h"
31 #include "ocrfeatures.h"
32 #include "clusttool.h"
33 #include "cluster.h"
34 #include <string.h>
35 #include <stdio.h>
36 #include <math.h>
37 #include "unichar.h"
38 #include "commontraining.h"
39 
40 #define PROGRAM_FEATURE_TYPE "cn"
41 
43 
44 /*----------------------------------------------------------------------------
45  Public Function Prototypes
46 ----------------------------------------------------------------------------*/
// Program entry point; receives the raw command-line argument vector.
int main(int argc, char **argv);
50 
51 /*----------------------------------------------------------------------------
52  Private Function Prototypes
53 ----------------------------------------------------------------------------*/
54 
55 void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
56  const FEATURE_DESC_STRUCT *feature_desc);
57 
58 /*
59 PARAMDESC *ConvertToPARAMDESC(
60  PARAM_DESC* Param_Desc,
61  int N);
62 */
63 
64 void WriteProtos(
65  FILE *File,
66  uinT16 N,
67  LIST ProtoList,
68  BOOL8 WriteSigProtos,
69  BOOL8 WriteInsigProtos);
70 
71 /*----------------------------------------------------------------------------
72  Global Data Definitions and Declarations
73 ----------------------------------------------------------------------------*/
74 /* global variable to hold configuration parameters to control clustering */
75 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
77 {
78  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
79 };
80 
81 /*----------------------------------------------------------------------------
82  Public Code
83 ----------------------------------------------------------------------------*/
84 /*---------------------------------------------------------------------------*/
133 int main(int argc, char *argv[]) {
134  // Set the global Config parameters before parsing the command line.
135  Config = CNConfig;
136 
137  const char *PageName;
138  FILE *TrainingPage;
139  LIST CharList = NIL_LIST;
140  CLUSTERER *Clusterer = NULL;
141  LIST ProtoList = NIL_LIST;
142  LIST NormProtoList = NIL_LIST;
143  LIST pCharList;
144  LABELEDLIST CharSample;
145  FEATURE_DEFS_STRUCT FeatureDefs;
146  InitFeatureDefs(&FeatureDefs);
147 
148  ParseArguments(&argc, &argv);
149  int num_fonts = 0;
150  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
151  printf("Reading %s ...\n", PageName);
152  TrainingPage = Efopen(PageName, "rb");
154  100, NULL, TrainingPage, &CharList);
155  fclose(TrainingPage);
156  ++num_fonts;
157  }
158  printf("Clustering ...\n");
159  // To allow an individual font to form a separate cluster,
160  // reduce the min samples:
161  // Config.MinSamples = 0.5 / num_fonts;
162  pCharList = CharList;
163  // The norm protos will count the source protos, so we keep them here in
164  // freeable_protos, so they can be freed later.
165  GenericVector<LIST> freeable_protos;
166  iterate(pCharList) {
167  //Cluster
168  CharSample = (LABELEDLIST)first_node(pCharList);
169  Clusterer =
170  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
171  if (Clusterer == NULL) { // To avoid a SIGSEGV
172  fprintf(stderr, "Error: NULL clusterer!\n");
173  return 1;
174  }
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  freeable_protos.push_back(ProtoList);
194  FreeClusterer(Clusterer);
195  }
196  FreeTrainingSamples(CharList);
197  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
198  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
199  FeatureDefs.FeatureDesc[desc_index]);
200  FreeNormProtoList(NormProtoList);
201  for (int i = 0; i < freeable_protos.size(); ++i) {
202  FreeProtoList(&freeable_protos[i]);
203  }
204  printf ("\n");
205  return 0;
206 } // main
207 
208 /*----------------------------------------------------------------------------
209  Private Code
210 ----------------------------------------------------------------------------*/
211 
212 /*----------------------------------------------------------------------------*/
224 void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
225  const FEATURE_DESC_STRUCT *feature_desc) {
226  FILE *File;
227  STRING Filename;
228  LABELEDLIST LabeledProto;
229  int N;
230 
231  Filename = "";
232  if (Directory != NULL && Directory[0] != '\0')
233  {
234  Filename += Directory;
235  Filename += "/";
236  }
237  Filename += "normproto";
238  printf ("\nWriting %s ...", Filename.string());
239  File = Efopen (Filename.string(), "wb");
240  fprintf(File, "%0d\n", feature_desc->NumParams);
241  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
242  iterate(LabeledProtoList)
243  {
244  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
245  N = NumberOfProtos(LabeledProto->List, true, false);
246  if (N < 1) {
247  printf ("\nError! Not enough protos for %s: %d protos"
248  " (%d significant protos"
249  ", %d insignificant protos)\n",
250  LabeledProto->Label, N,
251  NumberOfProtos(LabeledProto->List, 1, 0),
252  NumberOfProtos(LabeledProto->List, 0, 1));
253  exit(1);
254  }
255  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
256  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
257  }
258  fclose (File);
259 
260 } // WriteNormProtos
261 
262 /*-------------------------------------------------------------------------*/
264  FILE *File,
265  uinT16 N,
266  LIST ProtoList,
267  BOOL8 WriteSigProtos,
268  BOOL8 WriteInsigProtos)
269 {
270  PROTOTYPE *Proto;
271 
272  // write prototypes
273  iterate(ProtoList)
274  {
275  Proto = (PROTOTYPE *) first_node ( ProtoList );
276  if (( Proto->Significant && WriteSigProtos ) ||
277  ( ! Proto->Significant && WriteInsigProtos ) )
278  WritePrototype( File, N, Proto );
279  }
280 } // WriteProtos
void FreeNormProtoList(LIST CharList)
#define first_node(l)
Definition: oldlist.h:139
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
void ParseArguments(int *argc, char ***argv)
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:76
unsigned char BOOL8
Definition: host.h:46
int push_back(T object)
unsigned Significant
Definition: cluster.h:68
const char * string() const
Definition: strngs.cpp:201
unsigned short uinT16
Definition: host.h:34
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:547
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FLOAT32 MinSamples
Definition: cluster.h:50
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
const char * GetNextFilename(int argc, const char *const *argv)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:513
void FreeTrainingSamples(LIST CharList)
int MagicSamples
Definition: cluster.h:55
int main(int argc, char **argv)
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
DECLARE_STRING_PARAM_FLAG(D)
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:40
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
Definition: cntraining.cpp:224
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:319
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:574
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:263