tesseract  3.05.02
clusttool.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: clustertool.c
3  ** Purpose: Misc. tools for use with the clustering routines
4  ** Author: Dan Johnson
5  ** History: 6/6/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 //--------------------------Include Files----------------------------------
20 #include "clusttool.h"
21 #include "const.h"
22 #include "danerror.h"
23 #include "emalloc.h"
24 #include "scanutils.h"
25 #include <stdio.h>
26 #include <math.h>
27 
28 //---------------Global Data Definitions and Declarations--------------------
/// Capacity, in bytes, of the token buffers filled from an input file.
/// The buffer must also hold the terminating NUL, so a token can carry at
/// most TOKENSIZE - 1 characters.
#define TOKENSIZE 80
/// Largest sample size (number of feature-space dimensions) accepted by
/// ReadSampleSize.
#define MAXSAMPLESIZE 65535
// Disabled: max num of samples in a character (block size).
// #define MAXBLOCKSIZE 65535
33 
44 uinT16 ReadSampleSize(FILE *File) {
45  int SampleSize;
46 
47  if ((tfscanf(File, "%d", &SampleSize) != 1) ||
48  (SampleSize < 0) || (SampleSize > MAXSAMPLESIZE))
49  DoError (ILLEGALSAMPLESIZE, "Illegal sample size");
50  return (SampleSize);
51 }
52 
67 PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
68  int i;
69  PARAM_DESC *ParamDesc;
70  char Token[TOKENSIZE];
71 
72  ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
73  for (i = 0; i < N; i++) {
74  if (tfscanf(File, "%s", Token) != 1)
76  "Illegal circular/linear specification");
77  if (Token[0] == 'c')
78  ParamDesc[i].Circular = TRUE;
79  else
80  ParamDesc[i].Circular = FALSE;
81 
82  if (tfscanf(File, "%s", Token) != 1)
84  "Illegal essential/non-essential spec");
85  if (Token[0] == 'e')
86  ParamDesc[i].NonEssential = FALSE;
87  else
88  ParamDesc[i].NonEssential = TRUE;
89  if (tfscanf(File, "%f%f", &(ParamDesc[i].Min), &(ParamDesc[i].Max)) != 2)
90  DoError (ILLEGALMINMAXSPEC, "Illegal min or max specification");
91  ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
92  ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
93  ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
94  }
95  return (ParamDesc);
96 }
97 
114 PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
115  char Token[TOKENSIZE];
116  int Status;
117  PROTOTYPE *Proto;
118  int SampleCount;
119  int i;
120 
121  if ((Status = tfscanf(File, "%s", Token)) == 1) {
122  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
123  Proto->Cluster = NULL;
124  if (Token[0] == 's')
125  Proto->Significant = TRUE;
126  else
127  Proto->Significant = FALSE;
128 
129  Proto->Style = ReadProtoStyle (File);
130 
131  if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0))
132  DoError (ILLEGALSAMPLECOUNT, "Illegal sample count");
133  Proto->NumSamples = SampleCount;
134 
135  Proto->Mean = ReadNFloats (File, N, NULL);
136  if (Proto->Mean == NULL)
137  DoError (ILLEGALMEANSPEC, "Illegal prototype mean");
138 
139  switch (Proto->Style) {
140  case spherical:
141  if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL)
142  DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
143  Proto->Magnitude.Spherical =
144  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
145  Proto->TotalMagnitude =
146  pow (Proto->Magnitude.Spherical, (float) N);
147  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
148  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
149  Proto->Distrib = NULL;
150  break;
151  case elliptical:
152  Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
153  if (Proto->Variance.Elliptical == NULL)
154  DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
155  Proto->Magnitude.Elliptical =
156  (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
157  Proto->Weight.Elliptical =
158  (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
159  Proto->TotalMagnitude = 1.0;
160  for (i = 0; i < N; i++) {
161  Proto->Magnitude.Elliptical[i] =
162  1.0 /
163  sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
164  Proto->Weight.Elliptical[i] =
165  1.0 / Proto->Variance.Elliptical[i];
166  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
167  }
168  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
169  Proto->Distrib = NULL;
170  break;
171  case mixed:
172  Proto->Distrib =
173  (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
174  for (i = 0; i < N; i++) {
175  if (tfscanf(File, "%s", Token) != 1)
177  "Illegal prototype distribution");
178  switch (Token[0]) {
179  case 'n':
180  Proto->Distrib[i] = normal;
181  break;
182  case 'u':
183  Proto->Distrib[i] = uniform;
184  break;
185  case 'r':
186  Proto->Distrib[i] = D_random;
187  break;
188  default:
190  "Illegal prototype distribution");
191  }
192  }
193  Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
194  if (Proto->Variance.Elliptical == NULL)
195  DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
196  Proto->Magnitude.Elliptical =
197  (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
198  Proto->Weight.Elliptical =
199  (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
200  Proto->TotalMagnitude = 1.0;
201  for (i = 0; i < N; i++) {
202  switch (Proto->Distrib[i]) {
203  case normal:
204  Proto->Magnitude.Elliptical[i] = 1.0 /
205  sqrt ((double)
206  (2.0 * PI * Proto->Variance.Elliptical[i]));
207  Proto->Weight.Elliptical[i] =
208  1.0 / Proto->Variance.Elliptical[i];
209  break;
210  case uniform:
211  case D_random:
212  Proto->Magnitude.Elliptical[i] = 1.0 /
213  (2.0 * Proto->Variance.Elliptical[i]);
214  break;
215  case DISTRIBUTION_COUNT:
216  ASSERT_HOST(!"Distribution count not allowed!");
217  }
218  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
219  }
220  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
221  break;
222  }
223  return (Proto);
224  }
225  else if (Status == EOF)
226  return (NULL);
227  else {
228  DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
229  return (NULL);
230  }
231 }
232 
243  char Token[TOKENSIZE];
244  PROTOSTYLE Style;
245 
246  if (tfscanf(File, "%s", Token) != 1)
247  DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
248  switch (Token[0]) {
249  case 's':
250  Style = spherical;
251  break;
252  case 'e':
253  Style = elliptical;
254  break;
255  case 'm':
256  Style = mixed;
257  break;
258  case 'a':
259  Style = automatic;
260  break;
261  default:
262  Style = elliptical;
263  DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
264  }
265  return (Style);
266 }
267 
282 FLOAT32* ReadNFloats(FILE * File, uinT16 N, FLOAT32 Buffer[]) {
283  bool needs_free = false;
284  int i;
285  int NumFloatsRead;
286 
287  if (Buffer == NULL) {
288  Buffer = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
289  needs_free = true;
290  }
291 
292  for (i = 0; i < N; i++) {
293  NumFloatsRead = tfscanf(File, "%f", &(Buffer[i]));
294  if (NumFloatsRead != 1) {
295  if ((NumFloatsRead == EOF) && (i == 0)) {
296  if (needs_free) {
297  Efree(Buffer);
298  }
299  return NULL;
300  } else {
301  DoError(ILLEGALFLOAT, "Illegal float specification");
302  }
303  }
304  }
305  return Buffer;
306 }
307 
319 void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]) {
320  int i;
321 
322  for (i = 0; i < N; i++) {
323  if (ParamDesc[i].Circular)
324  fprintf (File, "circular ");
325  else
326  fprintf (File, "linear ");
327 
328  if (ParamDesc[i].NonEssential)
329  fprintf (File, "non-essential ");
330  else
331  fprintf (File, "essential ");
332 
333  fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
334  }
335 }
336 
348 void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto) {
349  int i;
350 
351  if (Proto->Significant)
352  fprintf (File, "significant ");
353  else
354  fprintf (File, "insignificant ");
355  WriteProtoStyle (File, (PROTOSTYLE) Proto->Style);
356  fprintf (File, "%6d\n\t", Proto->NumSamples);
357  WriteNFloats (File, N, Proto->Mean);
358  fprintf (File, "\t");
359 
360  switch (Proto->Style) {
361  case spherical:
362  WriteNFloats (File, 1, &(Proto->Variance.Spherical));
363  break;
364  case elliptical:
365  WriteNFloats (File, N, Proto->Variance.Elliptical);
366  break;
367  case mixed:
368  for (i = 0; i < N; i++)
369  switch (Proto->Distrib[i]) {
370  case normal:
371  fprintf (File, " %9s", "normal");
372  break;
373  case uniform:
374  fprintf (File, " %9s", "uniform");
375  break;
376  case D_random:
377  fprintf (File, " %9s", "random");
378  break;
379  case DISTRIBUTION_COUNT:
380  ASSERT_HOST(!"Distribution count not allowed!");
381  }
382  fprintf (File, "\n\t");
383  WriteNFloats (File, N, Proto->Variance.Elliptical);
384  }
385 }
386 
398 void WriteNFloats(FILE * File, uinT16 N, FLOAT32 Array[]) {
399  for (int i = 0; i < N; i++)
400  fprintf(File, " %9.6f", Array[i]);
401  fprintf(File, "\n");
402 }
403 
415 void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
416  switch (ProtoStyle) {
417  case spherical:
418  fprintf (File, "spherical");
419  break;
420  case elliptical:
421  fprintf (File, "elliptical");
422  break;
423  case mixed:
424  fprintf (File, "mixed");
425  break;
426  case automatic:
427  fprintf (File, "automatic");
428  break;
429  }
430 }
431 
449 void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[],
450  LIST ProtoList, BOOL8 WriteSigProtos,
451  BOOL8 WriteInsigProtos) {
452  PROTOTYPE *Proto;
453 
454  /* write file header */
455  fprintf(File,"%0d\n",N);
456  WriteParamDesc(File,N,ParamDesc);
457 
458  /* write prototypes */
459  iterate(ProtoList)
460  {
461  Proto = (PROTOTYPE *) first_node ( ProtoList );
462  if ((Proto->Significant && WriteSigProtos) ||
463  (!Proto->Significant && WriteInsigProtos))
464  WritePrototype(File, N, Proto);
465  }
466 }
#define ILLEGALFLOAT
Definition: clusttool.h:62
CLUSTER * Cluster
Definition: cluster.h:76
#define first_node(l)
Definition: oldlist.h:139
#define TRUE
Definition: capi.h:45
#define ILLEGALCIRCULARSPEC
Definition: clusttool.h:54
#define ILLEGALDISTRIBUTION
Definition: clusttool.h:61
PROTOTYPE * ReadPrototype(FILE *File, uinT16 N)
Definition: clusttool.cpp:114
DISTRIBUTION
Definition: cluster.h:58
#define ILLEGALMINMAXSPEC
Definition: clusttool.h:55
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 Range
Definition: ocrfeatures.h:51
#define ILLEGALSAMPLESIZE
Definition: clusttool.h:53
Definition: cluster.h:59
#define PI
Definition: const.h:19
FLOAT32 LogMagnitude
Definition: cluster.h:80
void Efree(void *ptr)
Definition: emalloc.cpp:79
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
FLOATUNION Variance
Definition: cluster.h:81
#define ILLEGALESSENTIALSPEC
Definition: clusttool.h:63
unsigned char BOOL8
Definition: host.h:46
unsigned Significant
Definition: cluster.h:68
#define ILLEGALSIGNIFICANCESPEC
Definition: clusttool.h:56
unsigned short uinT16
Definition: host.h:34
unsigned NumSamples
Definition: cluster.h:75
#define FALSE
Definition: capi.h:46
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
Definition: cluster.h:45
#define ILLEGALMEANSPEC
Definition: clusttool.h:59
float FLOAT32
Definition: host.h:44
DISTRIBUTION * Distrib
Definition: cluster.h:77
void WriteNFloats(FILE *File, uinT16 N, FLOAT32 Array[])
Definition: clusttool.cpp:398
FLOAT32 Spherical
Definition: cluster.h:63
FLOAT32 * Mean
Definition: cluster.h:78
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
FLOATUNION Magnitude
Definition: cluster.h:82
void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[], LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: clusttool.cpp:449
PARAM_DESC * ReadParamDesc(FILE *File, uinT16 N)
Definition: clusttool.cpp:67
FLOAT32 MidRange
Definition: ocrfeatures.h:53
inT8 Circular
Definition: ocrfeatures.h:47
inT8 NonEssential
Definition: ocrfeatures.h:48
#define TOKENSIZE
Definition: clusttool.cpp:29
uinT16 ReadSampleSize(FILE *File)
Definition: clusttool.cpp:44
FLOAT32 * ReadNFloats(FILE *File, uinT16 N, FLOAT32 Buffer[])
Definition: clusttool.cpp:282
#define iterate(l)
Definition: oldlist.h:159
unsigned Style
Definition: cluster.h:74
FLOAT32 Min
Definition: ocrfeatures.h:49
#define MAXSAMPLESIZE
Definition: clusttool.cpp:30
void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle)
Definition: clusttool.cpp:415
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define ILLEGALSAMPLECOUNT
Definition: clusttool.h:58
#define ILLEGALSTYLESPEC
Definition: clusttool.h:57
#define ILLEGALVARIANCESPEC
Definition: clusttool.h:60
FLOAT32 Max
Definition: ocrfeatures.h:50
PROTOSTYLE ReadProtoStyle(FILE *File)
Definition: clusttool.cpp:242
#define ASSERT_HOST(x)
Definition: errcode.h:84
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:319
FLOATUNION Weight
Definition: cluster.h:83
PROTOSTYLE
Definition: cluster.h:44
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348
FLOAT32 * Elliptical
Definition: cluster.h:64