tesseract  3.05.02
cluster.h File Reference
#include "kdtree.h"
#include "oldlist.h"

Go to the source code of this file.

Classes

struct  sample
 
struct  CLUSTERCONFIG
 
union  FLOATUNION
 
struct  PROTOTYPE
 
struct  CLUSTERER
 
struct  SAMPLELIST
 

Macros

#define MINBUCKETS   5
 
#define MAXBUCKETS   39
 
#define InitSampleSearch(S, C)   (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
 
#define ALREADYCLUSTERED   4000
 

Typedefs

typedef struct sample CLUSTER
 
typedef CLUSTER SAMPLE
 

Enumerations

enum  PROTOSTYLE { spherical, elliptical, mixed, automatic }
 
enum  DISTRIBUTION { normal, uniform, D_random, DISTRIBUTION_COUNT }
 

Functions

CLUSTERER TESS_APIMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLE TESS_APIMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void TESS_API FreeClusterer (CLUSTERER *Clusterer)
 
void TESS_API FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 TESS_API MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Macro Definition Documentation

◆ ALREADYCLUSTERED

#define ALREADYCLUSTERED   4000

Definition at line 133 of file cluster.h.

◆ InitSampleSearch

#define InitSampleSearch (   S,
 
)    (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))

Definition at line 105 of file cluster.h.

◆ MAXBUCKETS

#define MAXBUCKETS   39

Definition at line 27 of file cluster.h.

◆ MINBUCKETS

#define MINBUCKETS   5

Definition at line 26 of file cluster.h.

Typedef Documentation

◆ CLUSTER

typedef struct sample CLUSTER

◆ SAMPLE

typedef CLUSTER SAMPLE

Definition at line 42 of file cluster.h.

Enumeration Type Documentation

◆ DISTRIBUTION

Enumerator
normal 
uniform 
D_random 
DISTRIBUTION_COUNT 

Definition at line 58 of file cluster.h.

◆ PROTOSTYLE

enum PROTOSTYLE
Enumerator
spherical 
elliptical 
mixed 
automatic 

Definition at line 44 of file cluster.h.

44  {
46 } PROTOSTYLE;
Definition: cluster.h:45
PROTOSTYLE
Definition: cluster.h:44

Function Documentation

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 513 of file cluster.cpp.

513  {
514  //only create cluster tree if samples have never been clustered before
515  if (Clusterer->Root == NULL)
516  CreateClusterTree(Clusterer);
517 
518  //deallocate the old prototype list if one exists
519  FreeProtoList (&Clusterer->ProtoList);
520  Clusterer->ProtoList = NIL_LIST;
521 
522  //compute prototypes starting at the root node in the tree
523  ComputePrototypes(Clusterer, Config);
524  // We don't need the cluster pointers in the protos any more, so null them
525  // out, which makes it safe to delete the clusterer.
526  LIST proto_list = Clusterer->ProtoList;
527  iterate(proto_list) {
528  PROTOTYPE *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
529  proto->Cluster = NULL;
530  }
531  return Clusterer->ProtoList;
532 } // ClusterSamples
CLUSTER * Cluster
Definition: cluster.h:76
#define first_node(l)
Definition: oldlist.h:139
CLUSTER * Root
Definition: cluster.h:91
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
LIST ProtoList
Definition: cluster.h:92
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:704
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:930
#define iterate(l)
Definition: oldlist.h:159
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:574

◆ FreeClusterer()

void TESS_API FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 547 of file cluster.cpp.

547  {
548  if (Clusterer != NULL) {
549  memfree (Clusterer->ParamDesc);
550  if (Clusterer->KDTree != NULL)
551  FreeKDTree (Clusterer->KDTree);
552  if (Clusterer->Root != NULL)
553  FreeCluster (Clusterer->Root);
554  // Free up all used buckets structures.
555  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
556  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
557  if (Clusterer->bucket_cache[d][c] != NULL)
558  FreeBuckets(Clusterer->bucket_cache[d][c]);
559  }
560 
561  memfree(Clusterer);
562  }
563 } // FreeClusterer
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define MINBUCKETS
Definition: cluster.h:26
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2189
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2171
void memfree(void *element)
Definition: freelist.cpp:30
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:350
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
#define MAXBUCKETS
Definition: cluster.h:27

◆ FreeProtoList()

void TESS_API FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 574 of file cluster.cpp.

574  {
575  destroy_nodes(*ProtoList, FreePrototype);
576 } // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:588
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:199

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 588 of file cluster.cpp.

588  { //PROTOTYPE *Prototype)
589  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
590 
591  // unmark the corresponding cluster (if there is one
592  if (Prototype->Cluster != NULL)
593  Prototype->Cluster->Prototype = FALSE;
594 
595  // deallocate the prototype statistics and then the prototype itself
596  if (Prototype->Distrib != NULL)
597  memfree (Prototype->Distrib);
598  if (Prototype->Mean != NULL)
599  memfree (Prototype->Mean);
600  if (Prototype->Style != spherical) {
601  if (Prototype->Variance.Elliptical != NULL)
602  memfree (Prototype->Variance.Elliptical);
603  if (Prototype->Magnitude.Elliptical != NULL)
604  memfree (Prototype->Magnitude.Elliptical);
605  if (Prototype->Weight.Elliptical != NULL)
606  memfree (Prototype->Weight.Elliptical);
607  }
608  memfree(Prototype);
609 } // FreePrototype
CLUSTER * Cluster
Definition: cluster.h:76
unsigned Prototype
Definition: cluster.h:34
void memfree(void *element)
Definition: freelist.cpp:30
FLOATUNION Variance
Definition: cluster.h:81
#define FALSE
Definition: capi.h:46
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Magnitude
Definition: cluster.h:82
unsigned Style
Definition: cluster.h:74
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ MakeClusterer()

CLUSTERER TESS_API* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 400 of file cluster.cpp.

400  {
401  CLUSTERER *Clusterer;
402  int i;
403 
404  // allocate main clusterer data structure and init simple fields
405  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
406  Clusterer->SampleSize = SampleSize;
407  Clusterer->NumberOfSamples = 0;
408  Clusterer->NumChar = 0;
409 
410  // init fields which will not be used initially
411  Clusterer->Root = NULL;
412  Clusterer->ProtoList = NIL_LIST;
413 
414  // maintain a copy of param descriptors in the clusterer data structure
415  Clusterer->ParamDesc =
416  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
417  for (i = 0; i < SampleSize; i++) {
418  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
419  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
420  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
421  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
422  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
423  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
424  Clusterer->ParamDesc[i].MidRange =
425  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
426  }
427 
428  // allocate a kd tree to hold the samples
429  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
430 
431  // Initialize cache of histogram buckets to minimize recomputing them.
432  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
433  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
434  Clusterer->bucket_cache[d][c] = NULL;
435  }
436 
437  return Clusterer;
438 } // MakeClusterer
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Range
Definition: ocrfeatures.h:51
KDTREE * KDTree
Definition: cluster.h:90
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NIL_LIST
Definition: oldlist.h:126
#define MINBUCKETS
Definition: cluster.h:26
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:183
LIST ProtoList
Definition: cluster.h:92
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
FLOAT32 MidRange
Definition: ocrfeatures.h:53
inT8 Circular
Definition: ocrfeatures.h:47
inT8 NonEssential
Definition: ocrfeatures.h:48
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
inT32 NumChar
Definition: cluster.h:93
FLOAT32 Min
Definition: ocrfeatures.h:49
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define MAXBUCKETS
Definition: cluster.h:27
FLOAT32 Max
Definition: ocrfeatures.h:50
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87

◆ MakeSample()

SAMPLE TESS_API* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure
Note
Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called
History: 5/29/89, DSJ, Created.

Definition at line 456 of file cluster.cpp.

457  {
458  SAMPLE *Sample;
459  int i;
460 
461  // see if the samples have already been clustered - if so trap an error
462  if (Clusterer->Root != NULL)
464  "Can't add samples after they have been clustered");
465 
466  // allocate the new sample and initialize it
467  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
468  (Clusterer->SampleSize -
469  1) * sizeof (FLOAT32));
470  Sample->Clustered = FALSE;
471  Sample->Prototype = FALSE;
472  Sample->SampleCount = 1;
473  Sample->Left = NULL;
474  Sample->Right = NULL;
475  Sample->CharID = CharID;
476 
477  for (i = 0; i < Clusterer->SampleSize; i++)
478  Sample->Mean[i] = Feature[i];
479 
480  // add the sample to the KD tree - keep track of the total # of samples
481  Clusterer->NumberOfSamples++;
482  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
483  if (CharID >= Clusterer->NumChar)
484  Clusterer->NumChar = CharID + 1;
485 
486  // execute hook for monitoring clustering operation
487  // (*SampleCreationHook)( Sample );
488 
489  return (Sample);
490 } // MakeSample
CLUSTER * Root
Definition: cluster.h:91
#define ALREADYCLUSTERED
Definition: cluster.h:133
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:219
KDTREE * KDTree
Definition: cluster.h:90
inT32 CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
unsigned Clustered
Definition: cluster.h:33
inT32 NumChar
Definition: cluster.h:93
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
void * Emalloc(int Size)
Definition: emalloc.cpp:47
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87

◆ Mean()

FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 650 of file cluster.cpp.

650  {
651  return (Proto->Mean[Dimension]);
652 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78

◆ MergeClusters()

inT32 TESS_API MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.
Note
Exceptions: None
History: 5/31/89, DSJ, Created.

Definition at line 886 of file cluster.cpp.

891  {
892  inT32 i, n;
893 
894  n = n1 + n2;
895  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
896  if (ParamDesc->Circular) {
897  // if distance between means is greater than allowed
898  // reduce upper point by one "rotation" to compute mean
899  // then normalize the mean back into the accepted range
900  if ((*m2 - *m1) > ParamDesc->HalfRange) {
901  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
902  if (*m < ParamDesc->Min)
903  *m += ParamDesc->Range;
904  }
905  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
906  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else
911  *m = (n1 * *m1 + n2 * *m2) / n;
912  }
913  else
914  *m = (n1 * *m1 + n2 * *m2) / n;
915  }
916  return n;
917 } // MergeClusters
int inT32
Definition: host.h:35

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or NULL.
Note
Exceptions: None
History: 6/16/89, DSJ, Created.

Definition at line 626 of file cluster.cpp.

626  {
627  CLUSTER *Cluster;
628 
629  if (*SearchState == NIL_LIST)
630  return (NULL);
631  Cluster = (CLUSTER *) first_node (*SearchState);
632  *SearchState = pop (*SearchState);
633  while (TRUE) {
634  if (Cluster->Left == NULL)
635  return (Cluster);
636  *SearchState = push (*SearchState, Cluster->Right);
637  Cluster = Cluster->Left;
638  }
639 } // NextSample
#define first_node(l)
Definition: oldlist.h:139
#define TRUE
Definition: capi.h:45
#define NIL_LIST
Definition: oldlist.h:126
LIST pop(LIST list)
Definition: oldlist.cpp:299
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36

◆ StandardDeviation()

FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 663 of file cluster.cpp.

663  {
664  switch (Proto->Style) {
665  case spherical:
666  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
667  case elliptical:
668  return ((FLOAT32)
669  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
670  case mixed:
671  switch (Proto->Distrib[Dimension]) {
672  case normal:
673  return ((FLOAT32)
674  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
675  case uniform:
676  case D_random:
677  return (Proto->Variance.Elliptical[Dimension]);
678  case DISTRIBUTION_COUNT:
679  ASSERT_HOST(!"Distribution count not allowed!");
680  }
681  }
682  return 0.0f;
683 } // StandardDeviation
Definition: cluster.h:59
FLOATUNION Variance
Definition: cluster.h:81
Definition: cluster.h:45
float FLOAT32
Definition: host.h:44
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 Spherical
Definition: cluster.h:63
unsigned Style
Definition: cluster.h:74
#define ASSERT_HOST(x)
Definition: errcode.h:84
FLOAT32 * Elliptical
Definition: cluster.h:64