tesseract  3.05.02
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.cpp
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  ** History: Wed Dec 19 16:18:06 1990, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 /*----------------------------------------------------------------------------
19  Include Files and Type Defines
20 ----------------------------------------------------------------------------*/
21 #include "normmatch.h"
22 
23 #include <stdio.h>
24 #include <math.h>
25 
26 #include "classify.h"
27 #include "clusttool.h"
28 #include "const.h"
29 #include "efio.h"
30 #include "emalloc.h"
31 #include "globals.h"
32 #include "helpers.h"
33 #include "normfeat.h"
34 #include "scanutils.h"
35 #include "unicharset.h"
36 #include "params.h"
37 
39 {
40  int NumParams;
43  int NumProtos;
44 };
45 
46 /*----------------------------------------------------------------------------
47  Private Function Prototypes
48 ----------------------------------------------------------------------------*/
49 double NormEvidenceOf(register double NormAdj);
50 
51 void PrintNormMatch(FILE *File,
52  int NumParams,
53  PROTOTYPE *Proto,
54  FEATURE Feature);
55 
56 NORM_PROTOS *ReadNormProtos(FILE *File);
57 
58 /*----------------------------------------------------------------------------
59  Variables
60 ----------------------------------------------------------------------------*/
61 
// Divisor applied to the raw adjustment in NormEvidenceOf before the power is
// taken; an adjustment equal to this value maps to an evidence of 0.5.
63 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// Exponent applied to the scaled adjustment in NormEvidenceOf; the values 2
// and 3 are special-cased there as plain multiplications.
64 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// Factor applied to the width (CharNormRy) term before it is added to the
// match distance in ComputeNormMatch.
66 const double kWidthErrorWeighting = 0.125;
67 
68 /*----------------------------------------------------------------------------
69  Public Code
70 ----------------------------------------------------------------------------*/
71 /*---------------------------------------------------------------------------*/
72 namespace tesseract {
89  const FEATURE_STRUCT& feature,
90  BOOL8 DebugMatch) {
91  LIST Protos;
92  FLOAT32 BestMatch;
93  FLOAT32 Match;
94  FLOAT32 Delta;
95  PROTOTYPE *Proto;
96  int ProtoId;
97 
98  if (ClassId >= NormProtos->NumProtos) {
99  ClassId = NO_CLASS;
100  }
101 
102  /* handle requests for classification as noise */
103  if (ClassId == NO_CLASS) {
104  /* kludge - clean up constants and make into control knobs later */
105  Match = (feature.Params[CharNormLength] *
106  feature.Params[CharNormLength] * 500.0 +
107  feature.Params[CharNormRx] *
108  feature.Params[CharNormRx] * 8000.0 +
109  feature.Params[CharNormRy] *
110  feature.Params[CharNormRy] * 8000.0);
111  return (1.0 - NormEvidenceOf (Match));
112  }
113 
114  BestMatch = MAX_FLOAT32;
115  Protos = NormProtos->Protos[ClassId];
116 
117  if (DebugMatch) {
118  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
119  }
120 
121  ProtoId = 0;
122  iterate(Protos) {
123  Proto = (PROTOTYPE *) first_node (Protos);
124  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
125  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
126  if (DebugMatch) {
127  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
128  Proto->Mean[CharNormY], Delta,
129  Proto->Weight.Elliptical[CharNormY], Match);
130  }
131  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
132  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
133  if (DebugMatch) {
134  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
135  Proto->Mean[CharNormRx], Delta,
136  Proto->Weight.Elliptical[CharNormRx], Match);
137  }
138  // Ry is width! See intfx.cpp.
139  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
140  if (DebugMatch) {
141  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
142  Proto->Mean[CharNormRy], Delta,
143  Proto->Weight.Elliptical[CharNormRy]);
144  }
145  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
146  Delta *= kWidthErrorWeighting;
147  Match += Delta;
148  if (DebugMatch) {
149  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
150  Match, Match / classify_norm_adj_midpoint,
151  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
152  }
153 
154  if (Match < BestMatch)
155  BestMatch = Match;
156 
157  ProtoId++;
158  }
159  return 1.0 - NormEvidenceOf(BestMatch);
160 } /* ComputeNormMatch */
161 
163  if (NormProtos != NULL) {
164  for (int i = 0; i < NormProtos->NumProtos; i++)
168  Efree(NormProtos);
169  NormProtos = NULL;
170  }
171 }
172 } // namespace tesseract
173 
174 /*----------------------------------------------------------------------------
175  Private Code
176 ----------------------------------------------------------------------------*/
184 double NormEvidenceOf(register double NormAdj) {
185  NormAdj /= classify_norm_adj_midpoint;
186 
187  if (classify_norm_adj_curl == 3)
188  NormAdj = NormAdj * NormAdj * NormAdj;
189  else if (classify_norm_adj_curl == 2)
190  NormAdj = NormAdj * NormAdj;
191  else
192  NormAdj = pow (NormAdj, classify_norm_adj_curl);
193  return (1.0 / (1.0 + NormAdj));
194 }
195 
196 
197 /*---------------------------------------------------------------------------*/
209 void PrintNormMatch(FILE *File,
210  int NumParams,
211  PROTOTYPE *Proto,
212  FEATURE Feature) {
213  int i;
214  FLOAT32 ParamMatch;
215  FLOAT32 TotalMatch;
216 
217  for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
218  ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
219  StandardDeviation(Proto, i);
220 
221  fprintf (File, " %6.1f", ParamMatch);
222 
223  if (i == CharNormY || i == CharNormRx)
224  TotalMatch += ParamMatch * ParamMatch;
225  }
226  fprintf (File, " --> %6.1f (%4.2f)\n",
227  TotalMatch, NormEvidenceOf (TotalMatch));
228 
229 } /* PrintNormMatch */
230 
231 
232 /*---------------------------------------------------------------------------*/
233 namespace tesseract {
247  int i;
248  char unichar[2 * UNICHAR_LEN + 1];
249  UNICHAR_ID unichar_id;
250  LIST Protos;
251  int NumProtos;
252 
253  /* allocate and initialization data structure */
254  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
256  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
257  for (i = 0; i < NormProtos->NumProtos; i++)
258  NormProtos->Protos[i] = NIL_LIST;
259 
260  /* read file header and save in data structure */
263 
264  /* read protos for each class into a separate list */
265  while ((end_offset < 0 || ftell(File) < end_offset) &&
266  tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
267  if (unicharset.contains_unichar(unichar)) {
268  unichar_id = unicharset.unichar_to_id(unichar);
269  Protos = NormProtos->Protos[unichar_id];
270  for (i = 0; i < NumProtos; i++)
271  Protos =
273  NormProtos->Protos[unichar_id] = Protos;
274  } else {
275  cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
276  unichar);
277  for (i = 0; i < NumProtos; i++)
279  }
280  SkipNewline(File);
281  }
282  return (NormProtos);
283 } /* ReadNormProtos */
284 } // namespace tesseract
#define first_node(l)
Definition: oldlist.h:139
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:41
void cprintf(const char *format,...)
Definition: callcpp.cpp:40
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
PROTOTYPE * ReadPrototype(FILE *File, uinT16 N)
Definition: clusttool.cpp:114
const double kWidthErrorWeighting
Definition: normmatch.cpp:66
bool TESS_API contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int size() const
Definition: unicharset.h:297
LIST * Protos
Definition: normmatch.cpp:42
#define NIL_LIST
Definition: oldlist.h:126
double classify_norm_adj_midpoint
Definition: normmatch.cpp:63
void Efree(void *ptr)
Definition: emalloc.cpp:79
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
unsigned char BOOL8
Definition: host.h:46
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:35
#define MAX_FLOAT32
Definition: host.h:57
NORM_PROTOS * ReadNormProtos(FILE *File, inT64 end_offset)
Definition: normmatch.cpp:245
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:332
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:663
#define UNICHAR_LEN
Definition: unichar.h:30
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
Definition: normmatch.cpp:88
void FreePrototype(void *arg)
Definition: cluster.cpp:588
float FLOAT32
Definition: host.h:44
long long int inT64
Definition: host.h:41
#define NO_CLASS
Definition: matchdefs.h:36
FLOAT32 * Mean
Definition: cluster.h:78
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define tprintf(...)
Definition: tprintf.h:31
PARAM_DESC * ReadParamDesc(FILE *File, uinT16 N)
Definition: clusttool.cpp:67
double NormEvidenceOf(register double NormAdj)
Definition: normmatch.cpp:184
uinT16 ReadSampleSize(FILE *File)
Definition: clusttool.cpp:44
#define iterate(l)
Definition: oldlist.h:159
void * Emalloc(int Size)
Definition: emalloc.cpp:47
NORM_PROTOS * ReadNormProtos(FILE *File)
double classify_norm_adj_curl
Definition: normmatch.cpp:64
UNICHARSET unicharset
Definition: ccutil.h:70
#define double_VAR(name, val, comment)
Definition: params.h:286
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
NORM_PROTOS * NormProtos
Definition: classify.h:486
void SkipNewline(FILE *file)
Definition: helpers.h:84
FLOATUNION Weight
Definition: cluster.h:83
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:574
void PrintNormMatch(FILE *File, int NumParams, PROTOTYPE *Proto, FEATURE Feature)
Definition: normmatch.cpp:209
int UNICHAR_ID
Definition: unichar.h:33
FLOAT32 * Elliptical
Definition: cluster.h:64