tesseract  3.05.02
cluster.cpp File Reference
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include "helpers.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   ( ( (N) < 0 ) ? ( -(N) ) : (N) )
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   MAX_FLOAT32
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

typedef tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
 
typedef tesseract::GenericHeap< ClusterPairClusterHeap
 
typedef FLOAT64(* DENSITYFUNC) (inT32)
 
typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
inT32 MergeClusters (inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
 
void MakeDimRandom (uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (inT16 N, CLUSTER *Cluster)
 
BOOL8 Independent (PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
uinT16 OptimumNumberOfBuckets (uinT32 SampleCount)
 
FLOAT64 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 NormalDensity (inT32 x)
 
FLOAT64 UniformDensity (inT32 x)
 
FLOAT64 Integral (FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 NormalBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 UniformBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
BOOL8 DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uinT16 DegreesOfFreedom (DISTRIBUTION Distribution, uinT16 HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uinT32 NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 Solve (SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
 
FLOAT64 ChiArea (CHISTRUCT *ChiParams, FLOAT64 x)
 
BOOL8 MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

◆ Abs

#define Abs (   N)    ( ( (N) < 0 ) ? ( -(N) ) : (N) )

Definition at line 208 of file cluster.cpp.

◆ BUCKETTABLESIZE

#define BUCKETTABLESIZE   1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 160 of file cluster.cpp.

◆ CHIACCURACY

#define CHIACCURACY   0.01

◆ DELTARATIO

#define DELTARATIO   0.1

◆ FTABLE_X

#define FTABLE_X   10

Definition at line 31 of file cluster.cpp.

◆ FTABLE_Y

#define FTABLE_Y   100

Definition at line 32 of file cluster.cpp.

◆ HOTELLING

#define HOTELLING   1

Definition at line 30 of file cluster.cpp.

◆ ILLEGAL_CHAR

#define ILLEGAL_CHAR   2

◆ INITIALDELTA

#define INITIALDELTA   0.1

◆ LOOKUPTABLESIZE

#define LOOKUPTABLESIZE   8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 228 of file cluster.cpp.

◆ MAXDEGREESOFFREEDOM

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 229 of file cluster.cpp.

◆ MAXDISTANCE

#define MAXDISTANCE   MAX_FLOAT32

◆ MAXNEIGHBORS

#define MAXNEIGHBORS   2

◆ MINALPHA

#define MINALPHA   (1e-200)

◆ MINSAMPLES

#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 151 of file cluster.cpp.

◆ MINSAMPLESNEEDED

#define MINSAMPLESNEEDED   1

Definition at line 152 of file cluster.cpp.

◆ MINSAMPLESPERBUCKET

#define MINSAMPLESPERBUCKET   5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 150 of file cluster.cpp.

◆ MINVARIANCE

#define MINVARIANCE   0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 142 of file cluster.cpp.

◆ Mirror

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 207 of file cluster.cpp.

◆ NORMALEXTENT

#define NORMALEXTENT   3.0

Definition at line 161 of file cluster.cpp.

◆ Odd

#define Odd (   N)    ((N)%2)

Definition at line 206 of file cluster.cpp.

◆ SqrtOf2Pi

#define SqrtOf2Pi   2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 218 of file cluster.cpp.

Typedef Documentation

◆ ClusterHeap

Definition at line 169 of file cluster.cpp.

◆ ClusterPair

Definition at line 168 of file cluster.cpp.

◆ DENSITYFUNC

typedef FLOAT64(* DENSITYFUNC) (inT32)

Definition at line 203 of file cluster.cpp.

◆ SOLVEFUNC

typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)

Definition at line 204 of file cluster.cpp.

Function Documentation

◆ AdjustBuckets()

void AdjustBuckets ( BUCKETS Buckets,
uinT32  NewSampleCount 
)

This routine multiplies each ExpectedCount histogram entry by NewSampleCount/OldSampleCount so that the histogram is now adjusted to the new sample count.

Parameters
Bucketshistogram data structure to adjust
NewSampleCountnew sample count to adjust to
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2266 of file cluster.cpp.

2266  {
2267  int i;
2268  FLOAT64 AdjustFactor;
2269 
2270  AdjustFactor = (((FLOAT64) NewSampleCount) /
2271  ((FLOAT64) Buckets->SampleCount));
2272 
2273  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2274  Buckets->ExpectedCount[i] *= AdjustFactor;
2275  }
2276 
2277  Buckets->SampleCount = NewSampleCount;
2278 
2279 } // AdjustBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
uinT32 SampleCount
Definition: cluster.cpp:180
double FLOAT64
Definition: host.h:45
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186

◆ AlphaMatch()

int AlphaMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of structures which hold pre-computed chi-squared values for a chi-squared value whose corresponding alpha field matches the alpha field of SearchKey.

It is called by the list search routines.

Parameters
arg1chi-squared struct being tested for a match
arg2chi-squared struct that is the search key
Returns
TRUE if ChiStruct's Alpha matches SearchKey's Alpha
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2312 of file cluster.cpp.

2313  { //CHISTRUCT *SearchKey)
2314  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2315  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2316 
2317  return (ChiStruct->Alpha == SearchKey->Alpha);
2318 
2319 } // AlphaMatch
FLOAT64 Alpha
Definition: cluster.cpp:191

◆ ChiArea()

FLOAT64 ChiArea ( CHISTRUCT ChiParams,
FLOAT64  x 
)

This routine computes the area under a chi density curve from 0 to x, minus the desired area under the curve. The number of degrees of freedom of the chi curve is specified in the ChiParams structure. The desired area is also specified in the ChiParams structure as Alpha ( or 1 minus the desired area ). This routine is intended to be passed to the Solve() function to find the value of chi-squared which will yield a desired area under the right tail of the chi density curve. The function will only work for even degrees of freedom. The equations are based on integrating the chi density curve in parts to obtain a series that can be used to compute the area under the curve.

Parameters
ChiParamscontains degrees of freedom and alpha
xvalue of chi-squared to evaluate
Returns
Error between actual and desired area under the chi curve.
Note
Exceptions: none
History: Fri Aug 4 12:48:41 1989, DSJ, Created.

Definition at line 2424 of file cluster.cpp.

2424  {
2425  int i, N;
2426  FLOAT64 SeriesTotal;
2427  FLOAT64 Denominator;
2428  FLOAT64 PowerOfx;
2429 
2430  N = ChiParams->DegreesOfFreedom / 2 - 1;
2431  SeriesTotal = 1;
2432  Denominator = 1;
2433  PowerOfx = 1;
2434  for (i = 1; i <= N; i++) {
2435  Denominator *= 2 * i;
2436  PowerOfx *= x;
2437  SeriesTotal += PowerOfx / Denominator;
2438  }
2439  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2440 
2441 } // ChiArea
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
FLOAT64 Alpha
Definition: cluster.cpp:191
double FLOAT64
Definition: host.h:45

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 513 of file cluster.cpp.

513  {
514  //only create cluster tree if samples have never been clustered before
515  if (Clusterer->Root == NULL)
516  CreateClusterTree(Clusterer);
517 
518  //deallocate the old prototype list if one exists
519  FreeProtoList (&Clusterer->ProtoList);
520  Clusterer->ProtoList = NIL_LIST;
521 
522  //compute prototypes starting at the root node in the tree
523  ComputePrototypes(Clusterer, Config);
524  // We don't need the cluster pointers in the protos any more, so null them
525  // out, which makes it safe to delete the clusterer.
526  LIST proto_list = Clusterer->ProtoList;
527  iterate(proto_list) {
528  PROTOTYPE *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
529  proto->Cluster = NULL;
530  }
531  return Clusterer->ProtoList;
532 } // ClusterSamples
CLUSTER * Cluster
Definition: cluster.h:76
#define first_node(l)
Definition: oldlist.h:139
CLUSTER * Root
Definition: cluster.h:91
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
LIST ProtoList
Definition: cluster.h:92
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:704
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:930
#define iterate(l)
Definition: oldlist.h:159
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:574

◆ ComputeChiSquared()

FLOAT64 ComputeChiSquared ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine computes the chi-squared value which will leave a cumulative probability of Alpha in the right tail of a chi-squared distribution with the specified number of degrees of freedom. Alpha must be between 0 and 1. DegreesOfFreedom must be even. The routine maintains an array of lists. Each list corresponds to a different number of degrees of freedom. Each entry in the list corresponds to a different alpha value and its corresponding chi-squared value. Therefore, once a particular chi-squared value is computed, it is stored in the list and never needs to be computed again.

Parameters
DegreesOfFreedomdetermines shape of distribution
Alphaprobability of right tail
Returns
Desired chi-squared value
Note
Exceptions: none
History: 6/5/89, DSJ, Created.

Definition at line 1875 of file cluster.cpp.

1878 {
1879  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1880 
1881  CHISTRUCT *OldChiSquared;
1882  CHISTRUCT SearchKey;
1883 
1884  // limit the minimum alpha that can be used - if alpha is too small
1885  // it may not be possible to compute chi-squared.
1886  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1887  if (Odd (DegreesOfFreedom))
1888  DegreesOfFreedom++;
1889 
1890  /* find the list of chi-squared values which have already been computed
1891  for the specified number of degrees of freedom. Search the list for
1892  the desired chi-squared. */
1893  SearchKey.Alpha = Alpha;
1894  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1895  &SearchKey, AlphaMatch));
1896 
1897  if (OldChiSquared == NULL) {
1898  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1899  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1901  (FLOAT64) CHIACCURACY);
1902  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1903  OldChiSquared);
1904  }
1905  else {
1906  // further optimization might move OldChiSquared to front of list
1907  }
1908 
1909  return (OldChiSquared->ChiSquared);
1910 
1911 } // ComputeChiSquared
#define first_node(l)
Definition: oldlist.h:139
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:229
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2332
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2211
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
FLOAT64 ChiSquared
Definition: cluster.cpp:192
FLOAT64 Alpha
Definition: cluster.cpp:191
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2312
#define Odd(N)
Definition: cluster.cpp:206
#define CHIACCURACY
double FLOAT64
Definition: host.h:45
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2424
#define MINALPHA
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2358

◆ ComputePrototypes()

void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 930 of file cluster.cpp.

930  {
931  LIST ClusterStack = NIL_LIST;
932  CLUSTER *Cluster;
933  PROTOTYPE *Prototype;
934 
935  // use a stack to keep track of clusters waiting to be processed
936  // initially the only cluster on the stack is the root cluster
937  if (Clusterer->Root != NULL)
938  ClusterStack = push (NIL_LIST, Clusterer->Root);
939 
940  // loop until we have analyzed all clusters which are potential prototypes
941  while (ClusterStack != NIL_LIST) {
942  // remove the next cluster to be analyzed from the stack
943  // try to make a prototype from the cluster
944  // if successful, put it on the proto list, else split the cluster
945  Cluster = (CLUSTER *) first_node (ClusterStack);
946  ClusterStack = pop (ClusterStack);
947  Prototype = MakePrototype(Clusterer, Config, Cluster);
948  if (Prototype != NULL) {
949  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
950  }
951  else {
952  ClusterStack = push (ClusterStack, Cluster->Right);
953  ClusterStack = push (ClusterStack, Cluster->Left);
954  }
955  }
956 } // ComputePrototypes
#define first_node(l)
Definition: oldlist.h:139
CLUSTER * Root
Definition: cluster.h:91
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
LIST pop(LIST list)
Definition: oldlist.cpp:299
LIST ProtoList
Definition: cluster.h:92
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:975

◆ ComputeStatistics()

STATISTICS * ComputeStatistics ( inT16  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions.

Parameters
Nnumber of dimensions
ParamDescarray of dimension descriptions
Clustercluster whose stats are to be computed
Returns
Pointer to new data structure containing statistics
Note
Exceptions: None
History: 6/2/89, DSJ, Created.

Definition at line 1418 of file cluster.cpp.

1418  {
1419  STATISTICS *Statistics;
1420  int i, j;
1421  FLOAT32 *CoVariance;
1422  FLOAT32 *Distance;
1423  LIST SearchState;
1424  SAMPLE *Sample;
1425  uinT32 SampleCountAdjustedForBias;
1426 
1427  // allocate memory to hold the statistics results
1428  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1429  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1430  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1431  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1432 
1433  // allocate temporary memory to hold the sample to mean distances
1434  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1435 
1436  // initialize the statistics
1437  Statistics->AvgVariance = 1.0;
1438  CoVariance = Statistics->CoVariance;
1439  for (i = 0; i < N; i++) {
1440  Statistics->Min[i] = 0.0;
1441  Statistics->Max[i] = 0.0;
1442  for (j = 0; j < N; j++, CoVariance++)
1443  *CoVariance = 0;
1444  }
1445  // find each sample in the cluster and merge it into the statistics
1446  InitSampleSearch(SearchState, Cluster);
1447  while ((Sample = NextSample (&SearchState)) != NULL) {
1448  for (i = 0; i < N; i++) {
1449  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1450  if (ParamDesc[i].Circular) {
1451  if (Distance[i] > ParamDesc[i].HalfRange)
1452  Distance[i] -= ParamDesc[i].Range;
1453  if (Distance[i] < -ParamDesc[i].HalfRange)
1454  Distance[i] += ParamDesc[i].Range;
1455  }
1456  if (Distance[i] < Statistics->Min[i])
1457  Statistics->Min[i] = Distance[i];
1458  if (Distance[i] > Statistics->Max[i])
1459  Statistics->Max[i] = Distance[i];
1460  }
1461  CoVariance = Statistics->CoVariance;
1462  for (i = 0; i < N; i++)
1463  for (j = 0; j < N; j++, CoVariance++)
1464  *CoVariance += Distance[i] * Distance[j];
1465  }
1466  // normalize the variances by the total number of samples
1467  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1468  // also compute the geometic mean of the diagonal variances
1469  // ensure that clusters with only 1 sample are handled correctly
1470  if (Cluster->SampleCount > 1)
1471  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1472  else
1473  SampleCountAdjustedForBias = 1;
1474  CoVariance = Statistics->CoVariance;
1475  for (i = 0; i < N; i++)
1476  for (j = 0; j < N; j++, CoVariance++) {
1477  *CoVariance /= SampleCountAdjustedForBias;
1478  if (j == i) {
1479  if (*CoVariance < MINVARIANCE)
1480  *CoVariance = MINVARIANCE;
1481  Statistics->AvgVariance *= *CoVariance;
1482  }
1483  }
1484  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1485  1.0 / N);
1486 
1487  // release temporary memory and return
1488  memfree(Distance);
1489  return (Statistics);
1490 } // ComputeStatistics
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 Range
Definition: ocrfeatures.h:51
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define MINVARIANCE
Definition: cluster.cpp:142
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 * CoVariance
Definition: cluster.cpp:173
FLOAT32 * Max
Definition: cluster.cpp:175
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:626
Definition: cluster.h:32
unsigned int uinT32
Definition: host.h:36
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Mean[1]
Definition: cluster.h:39

◆ CreateClusterTree()

void CreateClusterTree ( CLUSTERER Clusterer)

This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples.

Parameters
Clustererdata structure holdings samples to be clustered
Returns
None (the Clusterer data structure is changed)
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 704 of file cluster.cpp.

704  {
705  ClusteringContext context;
706  ClusterPair HeapEntry;
707  TEMPCLUSTER *PotentialCluster;
708 
709  // each sample and its nearest neighbor form a "potential" cluster
710  // save these in a heap with the "best" potential clusters on top
711  context.tree = Clusterer->KDTree;
712  context.candidates = (TEMPCLUSTER *)
713  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
714  context.next = 0;
715  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
716  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
717 
718  // form potential clusters into actual clusters - always do "best" first
719  while (context.heap->Pop(&HeapEntry)) {
720  PotentialCluster = HeapEntry.data;
721 
722  // if main cluster of potential cluster is already in another cluster
723  // then we don't need to worry about it
724  if (PotentialCluster->Cluster->Clustered) {
725  continue;
726  }
727 
728  // if main cluster is not yet clustered, but its nearest neighbor is
729  // then we must find a new nearest neighbor
730  else if (PotentialCluster->Neighbor->Clustered) {
731  PotentialCluster->Neighbor =
732  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
733  &HeapEntry.key);
734  if (PotentialCluster->Neighbor != NULL) {
735  context.heap->Push(&HeapEntry);
736  }
737  }
738 
739  // if neither cluster is already clustered, form permanent cluster
740  else {
741  PotentialCluster->Cluster =
742  MakeNewCluster(Clusterer, PotentialCluster);
743  PotentialCluster->Neighbor =
744  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
745  &HeapEntry.key);
746  if (PotentialCluster->Neighbor != NULL) {
747  context.heap->Push(&HeapEntry);
748  }
749  }
750  }
751 
752  // the root node in the cluster tree is now the only node in the kd-tree
753  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
754 
755  // free up the memory used by the K-D tree, heap, and temp clusters
756  FreeKDTree(context.tree);
757  Clusterer->KDTree = NULL;
758  delete context.heap;
759  memfree(context.candidates);
760 } // CreateClusterTree
CLUSTER * Cluster
Definition: cluster.cpp:164
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
void memfree(void *element)
Definition: freelist.cpp:30
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:771
CLUSTER * Neighbor
Definition: cluster.cpp:165
void Push(Pair *entry)
Definition: genericheap.h:95
void(* void_proc)(...)
Definition: cutil.h:66
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:804
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:350
unsigned Clustered
Definition: cluster.h:33
bool Pop(Pair *entry)
Definition: genericheap.h:118
Definition: cluster.h:32
void * Emalloc(int Size)
Definition: emalloc.cpp:47
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:842
#define RootOf(T)
Definition: kdtree.h:58
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:169
ClusterHeap * heap
Definition: cluster.cpp:197
inT32 NumberOfSamples
Definition: cluster.h:89
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:331

◆ DegreesOfFreedom()

uinT16 DegreesOfFreedom ( DISTRIBUTION  Distribution,
uinT16  HistogramBuckets 
)

This routine computes the degrees of freedom that should be used in a chi-squared test with the specified number of histogram buckets. The result is always rounded up to the next even number so that the value of chi-squared can be computed more easily. This will cause the value of chi-squared to be higher than the optimum value, resulting in the chi-square test being more lenient than optimum.

Parameters
Distributiondistribution being tested for
HistogramBucketsnumber of buckets in chi-square test
Returns
The number of degrees of freedom for a chi-square test
Note
Exceptions: none
History: Thu Aug 3 14:04:18 1989, DSJ, Created.

Definition at line 2211 of file cluster.cpp.

2211  {
2212  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2213 
2214  uinT16 AdjustedNumBuckets;
2215 
2216  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2217  if (Odd (AdjustedNumBuckets))
2218  AdjustedNumBuckets++;
2219  return (AdjustedNumBuckets);
2220 
2221 } // DegreesOfFreedom
unsigned char uinT8
Definition: host.h:32
unsigned short uinT16
Definition: host.h:34
#define Odd(N)
Definition: cluster.cpp:206

◆ DistributionOK()

BOOL8 DistributionOK ( BUCKETS Buckets)

This routine performs a chi-square goodness of fit test on the histogram data in the Buckets data structure. TRUE is returned if the histogram matches the probability distribution which was specified when the Buckets structure was originally created. Otherwise FALSE is returned.

Parameters
Bucketshistogram data to perform chi-square test on
Returns
TRUE if samples match distribution, FALSE otherwise
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2131 of file cluster.cpp.

2131  {
2132  FLOAT32 FrequencyDifference;
2133  FLOAT32 TotalDifference;
2134  int i;
2135 
2136  // compute how well the histogram matches the expected histogram
2137  TotalDifference = 0.0;
2138  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2139  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2140  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2141  Buckets->ExpectedCount[i];
2142  }
2143 
2144  // test to see if the difference is more than expected
2145  if (TotalDifference > Buckets->ChiSquared)
2146  return FALSE;
2147  else
2148  return TRUE;
2149 } // DistributionOK
uinT32 * Count
Definition: cluster.cpp:185
#define TRUE
Definition: capi.h:45
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
FLOAT64 ChiSquared
Definition: cluster.cpp:182

◆ FillBuckets()

void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uinT16  Dim,
PARAM_DESC ParamDesc,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine counts the number of cluster samples which fall within the various histogram buckets in Buckets. Only one dimension of each sample is examined. The exact meaning of the Mean and StdDev parameters depends on the distribution which is being analyzed (this info is in the Buckets data structure). For normal distributions, Mean and StdDev have the expected meanings. For uniform and random distributions the Mean is the center point of the range and the StdDev is 1/2 the range. A dimension with zero standard deviation cannot be statistically analyzed. In this case, a pseudo-analysis is used.

Parameters
Bucketshistogram buckets to count samples
Clustercluster whose samples are being analyzed
Dimdimension of samples which is being analyzed
ParamDescdescription of the dimension
Mean"mean" of the distribution
StdDev"standard deviation" of the distribution
Returns
None (the Buckets data structure is filled in)
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1990 of file cluster.cpp.

1995  {
1996  uinT16 BucketID;
1997  int i;
1998  LIST SearchState;
1999  SAMPLE *Sample;
2000 
2001  // initialize the histogram bucket counts to 0
2002  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2003  Buckets->Count[i] = 0;
2004 
2005  if (StdDev == 0.0) {
2006  /* if the standard deviation is zero, then we can't statistically
2007  analyze the cluster. Use a pseudo-analysis: samples exactly on
2008  the mean are distributed evenly across all buckets. Samples greater
2009  than the mean are placed in the last bucket; samples less than the
2010  mean are placed in the first bucket. */
2011 
2012  InitSampleSearch(SearchState, Cluster);
2013  i = 0;
2014  while ((Sample = NextSample (&SearchState)) != NULL) {
2015  if (Sample->Mean[Dim] > Mean)
2016  BucketID = Buckets->NumberOfBuckets - 1;
2017  else if (Sample->Mean[Dim] < Mean)
2018  BucketID = 0;
2019  else
2020  BucketID = i;
2021  Buckets->Count[BucketID] += 1;
2022  i++;
2023  if (i >= Buckets->NumberOfBuckets)
2024  i = 0;
2025  }
2026  }
2027  else {
2028  // search for all samples in the cluster and add to histogram buckets
2029  InitSampleSearch(SearchState, Cluster);
2030  while ((Sample = NextSample (&SearchState)) != NULL) {
2031  switch (Buckets->Distribution) {
2032  case normal:
2033  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2034  Mean, StdDev);
2035  break;
2036  case D_random:
2037  case uniform:
2038  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2039  Mean, StdDev);
2040  break;
2041  default:
2042  BucketID = 0;
2043  }
2044  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2045  }
2046  }
2047 } // FillBuckets
uinT32 * Count
Definition: cluster.cpp:185
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
DISTRIBUTION Distribution
Definition: cluster.cpp:179
Definition: cluster.h:59
#define InitSampleSearch(S, C)
Definition: cluster.h:105
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2062
unsigned short uinT16
Definition: host.h:34
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:626
Definition: cluster.h:32
FLOAT32 Mean[1]
Definition: cluster.h:39
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2097

◆ FindNearestNeighbor()

CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
FLOAT32 Distance 
)

This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise NULL is returned. The distance between the 2 nodes is placed in the specified variable.

Parameters
Treekd-tree to search in for nearest neighbor
Clustercluster whose nearest neighbor is to be found
Distanceptr to variable to report distance found
Returns
Pointer to the nearest neighbor of Cluster, or NULL
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 804 of file cluster.cpp.

807 {
808  CLUSTER *Neighbor[MAXNEIGHBORS];
809  FLOAT32 Dist[MAXNEIGHBORS];
810  int NumberOfNeighbors;
811  inT32 i;
812  CLUSTER *BestNeighbor;
813 
814  // find the 2 nearest neighbors of the cluster
816  &NumberOfNeighbors, (void **)Neighbor, Dist);
817 
818  // search for the nearest neighbor that is not the cluster itself
819  *Distance = MAXDISTANCE;
820  BestNeighbor = NULL;
821  for (i = 0; i < NumberOfNeighbors; i++) {
822  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
823  *Distance = Dist[i];
824  BestNeighbor = Neighbor[i];
825  }
826  }
827  return BestNeighbor;
828 } // FindNearestNeighbor
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:321
float FLOAT32
Definition: host.h:44
int inT32
Definition: host.h:35
#define MAXNEIGHBORS
Definition: cluster.h:32
#define MAXDISTANCE
FLOAT32 Mean[1]
Definition: cluster.h:39

◆ FreeBuckets()

void FreeBuckets ( BUCKETS buckets)

This routine properly frees the memory used by a BUCKETS.

Parameters
bucketspointer to data structure to be freed

Definition at line 2171 of file cluster.cpp.

2171  {
2172  Efree(buckets->Count);
2173  Efree(buckets->ExpectedCount);
2174  Efree(buckets);
2175 } // FreeBuckets
uinT32 * Count
Definition: cluster.cpp:185
void Efree(void *ptr)
Definition: emalloc.cpp:79
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186

◆ FreeCluster()

void FreeCluster ( CLUSTER Cluster)

This routine frees the memory consumed by the specified cluster and all of its subclusters. This is done by recursive calls to FreeCluster().

Parameters
Clusterpointer to cluster to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 2189 of file cluster.cpp.

2189  {
2190  if (Cluster != NULL) {
2191  FreeCluster (Cluster->Left);
2192  FreeCluster (Cluster->Right);
2193  memfree(Cluster);
2194  }
2195 } // FreeCluster
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2189
void memfree(void *element)
Definition: freelist.cpp:30
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 547 of file cluster.cpp.

547  {
548  if (Clusterer != NULL) {
549  memfree (Clusterer->ParamDesc);
550  if (Clusterer->KDTree != NULL)
551  FreeKDTree (Clusterer->KDTree);
552  if (Clusterer->Root != NULL)
553  FreeCluster (Clusterer->Root);
554  // Free up all used buckets structures.
555  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
556  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
557  if (Clusterer->bucket_cache[d][c] != NULL)
558  FreeBuckets(Clusterer->bucket_cache[d][c]);
559  }
560 
561  memfree(Clusterer);
562  }
563 } // FreeClusterer
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define MINBUCKETS
Definition: cluster.h:26
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2189
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2171
void memfree(void *element)
Definition: freelist.cpp:30
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:350
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
#define MAXBUCKETS
Definition: cluster.h:27

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 574 of file cluster.cpp.

574  {
575  destroy_nodes(*ProtoList, FreePrototype);
576 } // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:588
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:199

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 588 of file cluster.cpp.

588  { //PROTOTYPE *Prototype)
589  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
590 
591  // unmark the corresponding cluster (if there is one
592  if (Prototype->Cluster != NULL)
593  Prototype->Cluster->Prototype = FALSE;
594 
595  // deallocate the prototype statistics and then the prototype itself
596  if (Prototype->Distrib != NULL)
597  memfree (Prototype->Distrib);
598  if (Prototype->Mean != NULL)
599  memfree (Prototype->Mean);
600  if (Prototype->Style != spherical) {
601  if (Prototype->Variance.Elliptical != NULL)
602  memfree (Prototype->Variance.Elliptical);
603  if (Prototype->Magnitude.Elliptical != NULL)
604  memfree (Prototype->Magnitude.Elliptical);
605  if (Prototype->Weight.Elliptical != NULL)
606  memfree (Prototype->Weight.Elliptical);
607  }
608  memfree(Prototype);
609 } // FreePrototype
CLUSTER * Cluster
Definition: cluster.h:76
unsigned Prototype
Definition: cluster.h:34
void memfree(void *element)
Definition: freelist.cpp:30
FLOATUNION Variance
Definition: cluster.h:81
#define FALSE
Definition: capi.h:46
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Magnitude
Definition: cluster.h:82
unsigned Style
Definition: cluster.h:74
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ FreeStatistics()

void FreeStatistics ( STATISTICS Statistics)

This routine frees the memory used by the statistics data structure.

Parameters
Statisticspointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2159 of file cluster.cpp.

2159  {
2160  memfree (Statistics->CoVariance);
2161  memfree (Statistics->Min);
2162  memfree (Statistics->Max);
2163  memfree(Statistics);
2164 } // FreeStatistics
FLOAT32 * Min
Definition: cluster.cpp:174
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * CoVariance
Definition: cluster.cpp:173
FLOAT32 * Max
Definition: cluster.cpp:175

◆ GetBuckets()

BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket.

Parameters
clustererwhich keeps a bucket_cache for us.
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Bucket data structure
Note
Exceptions: none
History: Thu Aug 3 12:58:10 1989, DSJ, Created.

Definition at line 1694 of file cluster.cpp.

1697  {
1698  // Get an old bucket structure with the same number of buckets.
1699  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1700  BUCKETS *Buckets =
1701  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1702 
1703  // If a matching bucket structure is not found, make one and save it.
1704  if (Buckets == NULL) {
1705  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1706  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1707  Buckets;
1708  } else {
1709  // Just adjust the existing buckets.
1710  if (SampleCount != Buckets->SampleCount)
1711  AdjustBuckets(Buckets, SampleCount);
1712  if (Confidence != Buckets->Confidence) {
1713  Buckets->Confidence = Confidence;
1714  Buckets->ChiSquared = ComputeChiSquared(
1715  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1716  Confidence);
1717  }
1718  InitBuckets(Buckets);
1719  }
1720  return Buckets;
1721 } // GetBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2289
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2211
#define MINBUCKETS
Definition: cluster.h:26
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1875
uinT32 SampleCount
Definition: cluster.cpp:180
unsigned short uinT16
Definition: host.h:34
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1838
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1741
FLOAT64 Confidence
Definition: cluster.cpp:181
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2266
FLOAT64 ChiSquared
Definition: cluster.cpp:182

◆ Independent()

BOOL8 Independent ( PARAM_DESC  ParamDesc[],
inT16  N,
FLOAT32 CoVariance,
FLOAT32  Independence 
)

This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true).

Parameters
ParamDescdescriptions of each feature space dimension
Nnumber of dimensions
CoVarianceptr to a covariance matrix
Independencemax off-diagonal correlation coefficient
Returns
TRUE if dimensions are independent, FALSE otherwise
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1647 of file cluster.cpp.

1648  {
1649  int i, j;
1650  FLOAT32 *VARii; // points to ith on-diagonal element
1651  FLOAT32 *VARjj; // points to jth on-diagonal element
1652  FLOAT32 CorrelationCoeff;
1653 
1654  VARii = CoVariance;
1655  for (i = 0; i < N; i++, VARii += N + 1) {
1656  if (ParamDesc[i].NonEssential)
1657  continue;
1658 
1659  VARjj = VARii + N + 1;
1660  CoVariance = VARii + 1;
1661  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1662  if (ParamDesc[j].NonEssential)
1663  continue;
1664 
1665  if ((*VARii == 0.0) || (*VARjj == 0.0))
1666  CorrelationCoeff = 0.0;
1667  else
1668  CorrelationCoeff =
1669  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1670  if (CorrelationCoeff > Independence)
1671  return (FALSE);
1672  }
1673  }
1674  return (TRUE);
1675 } // Independent
#define TRUE
Definition: capi.h:45
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44

◆ InitBuckets()

void InitBuckets ( BUCKETS Buckets)

This routine sets the bucket counts in the specified histogram to zero.

Parameters
Bucketshistogram data structure to init
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2289 of file cluster.cpp.

2289  {
2290  int i;
2291 
2292  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2293  Buckets->Count[i] = 0;
2294  }
2295 
2296 } // InitBuckets
uinT32 * Count
Definition: cluster.cpp:185
uinT16 NumberOfBuckets
Definition: cluster.cpp:183

◆ Integral()

FLOAT64 Integral ( FLOAT64  f1,
FLOAT64  f2,
FLOAT64  Dx 
)

This routine computes a trapezoidal approximation to the integral of a function over a small delta in x.

Parameters
f1value of function at x1
f2value of function at x2
Dxx2 - x1 (should always be positive)
Returns
Approximation of the integral of the function from x1 to x2.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1964 of file cluster.cpp.

1964  {
1965  return (f1 + f2) * Dx / 2.0;
1966 } // Integral

◆ InvertMatrix()

double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Compute the inverse of a matrix using LU decomposition with partial pivoting. The return value is the sum of norms of the off-diagonal terms of the product of a and inv. (A measure of the error.)

Definition at line 2528 of file cluster.cpp.

2528  {
2529  // Allocate memory for the 2D arrays.
2530  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2531  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2532  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2533 
2534  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2535  int row;
2536  int col;
2537  for (row = 0; row < size; row++) {
2538  for (col = 0; col < size; col++) {
2539  U[row][col] = input[row*size + col];
2540  L[row][col] = row == col ? 1.0 : 0.0;
2541  U_inv[row][col] = 0.0;
2542  }
2543  }
2544 
2545  // Compute forward matrix by inversion by LU decomposition of input.
2546  for (col = 0; col < size; ++col) {
2547  // Find best pivot
2548  int best_row = 0;
2549  double best_pivot = -1.0;
2550  for (row = col; row < size; ++row) {
2551  if (Abs(U[row][col]) > best_pivot) {
2552  best_pivot = Abs(U[row][col]);
2553  best_row = row;
2554  }
2555  }
2556  // Exchange pivot rows.
2557  if (best_row != col) {
2558  for (int k = 0; k < size; ++k) {
2559  double tmp = U[best_row][k];
2560  U[best_row][k] = U[col][k];
2561  U[col][k] = tmp;
2562  tmp = L[best_row][k];
2563  L[best_row][k] = L[col][k];
2564  L[col][k] = tmp;
2565  }
2566  }
2567  // Now do the pivot itself.
2568  for (row = col + 1; row < size; ++row) {
2569  double ratio = -U[row][col] / U[col][col];
2570  for (int j = col; j < size; ++j) {
2571  U[row][j] += U[col][j] * ratio;
2572  }
2573  for (int k = 0; k < size; ++k) {
2574  L[row][k] += L[col][k] * ratio;
2575  }
2576  }
2577  }
2578  // Next invert U.
2579  for (col = 0; col < size; ++col) {
2580  U_inv[col][col] = 1.0 / U[col][col];
2581  for (row = col - 1; row >= 0; --row) {
2582  double total = 0.0;
2583  for (int k = col; k > row; --k) {
2584  total += U[row][k] * U_inv[k][col];
2585  }
2586  U_inv[row][col] = -total / U[row][row];
2587  }
2588  }
2589  // Now the answer is U_inv.L.
2590  for (row = 0; row < size; row++) {
2591  for (col = 0; col < size; col++) {
2592  double sum = 0.0;
2593  for (int k = row; k < size; ++k) {
2594  sum += U_inv[row][k] * L[k][col];
2595  }
2596  inv[row*size + col] = sum;
2597  }
2598  }
2599  // Check matrix product.
2600  double error_sum = 0.0;
2601  for (row = 0; row < size; row++) {
2602  for (col = 0; col < size; col++) {
2603  double sum = 0.0;
2604  for (int k = 0; k < size; ++k) {
2605  sum += input[row*size + k] * inv[k *size + col];
2606  }
2607  if (row != col) {
2608  error_sum += Abs(sum);
2609  }
2610  }
2611  }
2612  return error_sum;
2613 }
#define Abs(N)
Definition: cluster.cpp:208

◆ ListEntryMatch()

int ListEntryMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list for a list node whose contents match Key. It is called by the list delete_d routine.

Returns
TRUE if ListNode matches Key
Note
Exceptions: none
History: Thu Aug 3 14:23:58 1989, DSJ, Created.

Definition at line 2250 of file cluster.cpp.

2251  { //Key
2252  return (arg1 == arg2);
2253 
2254 } // ListEntryMatch

◆ MakeBuckets()

BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket.

Parameters
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Pointer to new histogram data structure
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1741 of file cluster.cpp.

1743  {
1744  const DENSITYFUNC DensityFunction[] =
1746  int i, j;
1747  BUCKETS *Buckets;
1748  FLOAT64 BucketProbability;
1749  FLOAT64 NextBucketBoundary;
1750  FLOAT64 Probability;
1751  FLOAT64 ProbabilityDelta;
1752  FLOAT64 LastProbDensity;
1753  FLOAT64 ProbDensity;
1754  uinT16 CurrentBucket;
1755  BOOL8 Symmetrical;
1756 
1757  // allocate memory needed for data structure
1758  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1759  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1760  Buckets->SampleCount = SampleCount;
1761  Buckets->Confidence = Confidence;
1762  Buckets->Count = reinterpret_cast<uinT32*>(
1763  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1764  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
1765  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1766 
1767  // initialize simple fields
1768  Buckets->Distribution = Distribution;
1769  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1770  Buckets->Count[i] = 0;
1771  Buckets->ExpectedCount[i] = 0.0;
1772  }
1773 
1774  // all currently defined distributions are symmetrical
1775  Symmetrical = TRUE;
1776  Buckets->ChiSquared = ComputeChiSquared(
1777  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1778 
1779  if (Symmetrical) {
1780  // allocate buckets so that all have approx. equal probability
1781  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1782 
1783  // distribution is symmetric so fill in upper half then copy
1784  CurrentBucket = Buckets->NumberOfBuckets / 2;
1785  if (Odd (Buckets->NumberOfBuckets))
1786  NextBucketBoundary = BucketProbability / 2;
1787  else
1788  NextBucketBoundary = BucketProbability;
1789 
1790  Probability = 0.0;
1791  LastProbDensity =
1792  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1793  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1794  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1795  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1796  Probability += ProbabilityDelta;
1797  if (Probability > NextBucketBoundary) {
1798  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1799  CurrentBucket++;
1800  NextBucketBoundary += BucketProbability;
1801  }
1802  Buckets->Bucket[i] = CurrentBucket;
1803  Buckets->ExpectedCount[CurrentBucket] +=
1804  (FLOAT32) (ProbabilityDelta * SampleCount);
1805  LastProbDensity = ProbDensity;
1806  }
1807  // place any leftover probability into the last bucket
1808  Buckets->ExpectedCount[CurrentBucket] +=
1809  (FLOAT32) ((0.5 - Probability) * SampleCount);
1810 
1811  // copy upper half of distribution to lower half
1812  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1813  Buckets->Bucket[i] =
1814  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1815 
1816  // copy upper half of expected counts to lower half
1817  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1818  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1819  }
1820  return Buckets;
1821 } // MakeBuckets
uinT32 * Count
Definition: cluster.cpp:185
#define TRUE
Definition: capi.h:45
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1945
DISTRIBUTION Distribution
Definition: cluster.cpp:179
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2211
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1875
unsigned char BOOL8
Definition: host.h:46
uinT32 SampleCount
Definition: cluster.cpp:180
#define Mirror(N, R)
Definition: cluster.cpp:207
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:203
unsigned short uinT16
Definition: host.h:34
#define Odd(N)
Definition: cluster.cpp:206
float FLOAT32
Definition: host.h:44
double FLOAT64
Definition: host.h:45
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1838
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1964
unsigned int uinT32
Definition: host.h:36
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT64 Confidence
Definition: cluster.cpp:181
FLOAT64 ChiSquared
Definition: cluster.cpp:182
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1929

◆ MakeClusterer()

CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 400 of file cluster.cpp.

400  {
401  CLUSTERER *Clusterer;
402  int i;
403 
404  // allocate main clusterer data structure and init simple fields
405  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
406  Clusterer->SampleSize = SampleSize;
407  Clusterer->NumberOfSamples = 0;
408  Clusterer->NumChar = 0;
409 
410  // init fields which will not be used initially
411  Clusterer->Root = NULL;
412  Clusterer->ProtoList = NIL_LIST;
413 
414  // maintain a copy of param descriptors in the clusterer data structure
415  Clusterer->ParamDesc =
416  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
417  for (i = 0; i < SampleSize; i++) {
418  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
419  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
420  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
421  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
422  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
423  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
424  Clusterer->ParamDesc[i].MidRange =
425  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
426  }
427 
428  // allocate a kd tree to hold the samples
429  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
430 
431  // Initialize cache of histogram buckets to minimize recomputing them.
432  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
433  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
434  Clusterer->bucket_cache[d][c] = NULL;
435  }
436 
437  return Clusterer;
438 } // MakeClusterer
CLUSTER * Root
Definition: cluster.h:91
FLOAT32 Range
Definition: ocrfeatures.h:51
KDTREE * KDTree
Definition: cluster.h:90
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NIL_LIST
Definition: oldlist.h:126
#define MINBUCKETS
Definition: cluster.h:26
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:183
LIST ProtoList
Definition: cluster.h:92
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
FLOAT32 MidRange
Definition: ocrfeatures.h:53
inT8 Circular
Definition: ocrfeatures.h:47
inT8 NonEssential
Definition: ocrfeatures.h:48
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
inT32 NumChar
Definition: cluster.h:93
FLOAT32 Min
Definition: ocrfeatures.h:49
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define MAXBUCKETS
Definition: cluster.h:27
FLOAT32 Max
Definition: ocrfeatures.h:50
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87

◆ MakeDegenerateProto()

PROTOTYPE * MakeDegenerateProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
inT32  MinSamples 
)

This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it.

If the cluster is not degenerate, NULL is returned.

Parameters
Nnumber of dimensions
Clustercluster being analyzed
Statisticsstatistical info about cluster
Styletype of prototype to be generated
MinSamplesminimum number of samples in a cluster
Returns
Pointer to degenerate prototype or NULL.
Note
Exceptions: None
History: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).

Definition at line 1069 of file cluster.cpp.

1074  {
1075  PROTOTYPE *Proto = NULL;
1076 
1077  if (MinSamples < MINSAMPLESNEEDED)
1078  MinSamples = MINSAMPLESNEEDED;
1079 
1080  if (Cluster->SampleCount < MinSamples) {
1081  switch (Style) {
1082  case spherical:
1083  Proto = NewSphericalProto (N, Cluster, Statistics);
1084  break;
1085  case elliptical:
1086  case automatic:
1087  Proto = NewEllipticalProto (N, Cluster, Statistics);
1088  break;
1089  case mixed:
1090  Proto = NewMixedProto (N, Cluster, Statistics);
1091  break;
1092  }
1093  Proto->Significant = FALSE;
1094  }
1095  return (Proto);
1096 } // MakeDegenerateProto
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1538
#define MINSAMPLESNEEDED
Definition: cluster.cpp:152
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1582
unsigned Significant
Definition: cluster.h:68
#define FALSE
Definition: capi.h:46
Definition: cluster.h:45
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1505
unsigned SampleCount
Definition: cluster.h:35

◆ MakeDimRandom()

void MakeDimRandom ( uinT16  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

This routine alters the ith dimension of the specified mixed prototype to be D_random.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
ParamDescdescription of specified dimension
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1358 of file cluster.cpp.

1358  {
1359  Proto->Distrib[i] = D_random;
1360  Proto->Mean[i] = ParamDesc->MidRange;
1361  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1362 
1363  // subtract out the previous magnitude of this dimension from the total
1364  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1365  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1366  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1367  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1368 
1369  // note that the proto Weight is irrelevant for D_random protos
1370 } // MakeDimRandom
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 Range
Definition: ocrfeatures.h:51
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 MidRange
Definition: ocrfeatures.h:53
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ MakeDimUniform()

void MakeDimUniform ( uinT16  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

This routine alters the ith dimension of the specified mixed prototype to be uniform.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
Statisticsstatistical info about prototype
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1382 of file cluster.cpp.

1382  {
1383  Proto->Distrib[i] = uniform;
1384  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1385  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1386  Proto->Variance.Elliptical[i] =
1387  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1388  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1389  Proto->Variance.Elliptical[i] = MINVARIANCE;
1390 
1391  // subtract out the previous magnitude of this dimension from the total
1392  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1393  Proto->Magnitude.Elliptical[i] =
1394  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1395  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1396  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1397 
1398  // note that the proto Weight is irrelevant for uniform protos
1399 } // MakeDimUniform
CLUSTER * Cluster
Definition: cluster.h:76
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 LogMagnitude
Definition: cluster.h:80
#define MINVARIANCE
Definition: cluster.cpp:142
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Max
Definition: cluster.cpp:175
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ MakeEllipticalProto()

PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new elliptical prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1255 of file cluster.cpp.

1258  {
1259  PROTOTYPE *Proto = NULL;
1260  int i;
1261 
1262  // check that each dimension is a normal distribution
1263  for (i = 0; i < Clusterer->SampleSize; i++) {
1264  if (Clusterer->ParamDesc[i].NonEssential)
1265  continue;
1266 
1267  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1268  Cluster->Mean[i],
1269  sqrt ((FLOAT64) Statistics->
1270  CoVariance[i * (Clusterer->SampleSize + 1)]));
1271  if (!DistributionOK (Buckets))
1272  break;
1273  }
1274  // if all dimensions matched a normal distribution, make a proto
1275  if (i >= Clusterer->SampleSize)
1276  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1277  return (Proto);
1278 } // MakeEllipticalProto
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1538
PARAM_DESC * ParamDesc
Definition: cluster.h:88
double FLOAT64
Definition: host.h:45
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2131
inT8 NonEssential
Definition: ocrfeatures.h:48
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1990
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87

◆ MakeMixedProto()

PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
FLOAT64  Confidence 
)

This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a prototype
Statisticsstatistical info about cluster
NormalBucketshistogram struct used to analyze distribution
Confidenceconfidence level for alternate distributions
Returns
Pointer to new mixed prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1297 of file cluster.cpp.

1301  {
1302  PROTOTYPE *Proto;
1303  int i;
1304  BUCKETS *UniformBuckets = NULL;
1305  BUCKETS *RandomBuckets = NULL;
1306 
1307  // create a mixed proto to work on - initially assume all dimensions normal*/
1308  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1309 
1310  // find the proper distribution for each dimension
1311  for (i = 0; i < Clusterer->SampleSize; i++) {
1312  if (Clusterer->ParamDesc[i].NonEssential)
1313  continue;
1314 
1315  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1316  Proto->Mean[i],
1317  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1318  if (DistributionOK (NormalBuckets))
1319  continue;
1320 
1321  if (RandomBuckets == NULL)
1322  RandomBuckets =
1323  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1324  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1325  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1326  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1327  if (DistributionOK (RandomBuckets))
1328  continue;
1329 
1330  if (UniformBuckets == NULL)
1331  UniformBuckets =
1332  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1333  MakeDimUniform(i, Proto, Statistics);
1334  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1335  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1336  if (DistributionOK (UniformBuckets))
1337  continue;
1338  break;
1339  }
1340  // if any dimension failed to match a distribution, discard the proto
1341  if (i < Clusterer->SampleSize) {
1342  FreePrototype(Proto);
1343  Proto = NULL;
1344  }
1345  return (Proto);
1346 } // MakeMixedProto
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1694
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOATUNION Variance
Definition: cluster.h:81
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1358
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1582
void FreePrototype(void *arg)
Definition: cluster.cpp:588
double FLOAT64
Definition: host.h:45
unsigned SampleCount
Definition: cluster.h:35
FLOAT32 * Mean
Definition: cluster.h:78
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2131
inT8 NonEssential
Definition: ocrfeatures.h:48
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1990
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1382
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ MakeNewCluster()

CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree.

Parameters
Clusterercurrent clustering environment
TempClusterpotential cluster to make permanent
Returns
Pointer to the new permanent cluster
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 842 of file cluster.cpp.

842  {
843  CLUSTER *Cluster;
844 
845  // allocate the new cluster and initialize it
846  Cluster = (CLUSTER *) Emalloc(
847  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
848  Cluster->Clustered = FALSE;
849  Cluster->Prototype = FALSE;
850  Cluster->Left = TempCluster->Cluster;
851  Cluster->Right = TempCluster->Neighbor;
852  Cluster->CharID = -1;
853 
854  // mark the old clusters as "clustered" and delete them from the kd-tree
855  Cluster->Left->Clustered = TRUE;
856  Cluster->Right->Clustered = TRUE;
857  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
858  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
859 
860  // compute the mean and sample count for the new cluster
861  Cluster->SampleCount =
862  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
863  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
864  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
865 
866  // add the new cluster to the KD tree
867  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
868  return Cluster;
869 } // MakeNewCluster
CLUSTER * Cluster
Definition: cluster.cpp:164
#define TRUE
Definition: capi.h:45
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:265
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:219
KDTREE * KDTree
Definition: cluster.h:90
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
CLUSTER * Neighbor
Definition: cluster.cpp:165
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
unsigned Clustered
Definition: cluster.h:33
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
void * Emalloc(int Size)
Definition: emalloc.cpp:47
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])

◆ MakePotentialClusters()

void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
inT32  Level 
)

This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Parameters
contextClusteringContext (see definition above)
Clustercurrent cluster being visited in kd-tree walk
Levellevel of this cluster in the kd-tree

Definition at line 771 of file cluster.cpp.

772  {
773  ClusterPair HeapEntry;
774  int next = context->next;
775  context->candidates[next].Cluster = Cluster;
776  HeapEntry.data = &(context->candidates[next]);
777  context->candidates[next].Neighbor =
778  FindNearestNeighbor(context->tree,
779  context->candidates[next].Cluster,
780  &HeapEntry.key);
781  if (context->candidates[next].Neighbor != NULL) {
782  context->heap->Push(&HeapEntry);
783  context->next++;
784  }
785 } // MakePotentialClusters
CLUSTER * Cluster
Definition: cluster.cpp:164
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
CLUSTER * Neighbor
Definition: cluster.cpp:165
void Push(Pair *entry)
Definition: genericheap.h:95
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:804
ClusterHeap * heap
Definition: cluster.cpp:197

◆ MakePrototype()

PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Clustercluster to be made into a prototype
Returns
Pointer to new prototype or NULL
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 975 of file cluster.cpp.

977  {
978  STATISTICS *Statistics;
979  PROTOTYPE *Proto;
980  BUCKETS *Buckets;
981 
982  // filter out clusters which contain samples from the same character
983  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
984  return NULL;
985 
986  // compute the covariance matrix and ranges for the cluster
987  Statistics =
988  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
989 
990  // check for degenerate clusters which need not be analyzed further
991  // note that the MinSamples test assumes that all clusters with multiple
992  // character samples have been removed (as above)
993  Proto = MakeDegenerateProto(
994  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
995  (inT32) (Config->MinSamples * Clusterer->NumChar));
996  if (Proto != NULL) {
997  FreeStatistics(Statistics);
998  return Proto;
999  }
1000  // check to ensure that all dimensions are independent
1001  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1002  Statistics->CoVariance, Config->Independence)) {
1003  FreeStatistics(Statistics);
1004  return NULL;
1005  }
1006 
1007  if (HOTELLING && Config->ProtoStyle == elliptical) {
1008  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1009  if (Proto != NULL) {
1010  FreeStatistics(Statistics);
1011  return Proto;
1012  }
1013  }
1014 
1015  // create a histogram data structure used to evaluate distributions
1016  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1017  Config->Confidence);
1018 
1019  // create a prototype based on the statistics and test it
1020  switch (Config->ProtoStyle) {
1021  case spherical:
1022  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1023  break;
1024  case elliptical:
1025  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1026  break;
1027  case mixed:
1028  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1029  Config->Confidence);
1030  break;
1031  case automatic:
1032  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1033  if (Proto != NULL)
1034  break;
1035  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1036  if (Proto != NULL)
1037  break;
1038  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1039  Config->Confidence);
1040  break;
1041  }
1042  FreeStatistics(Statistics);
1043  return Proto;
1044 } // MakePrototype
FLOAT64 Confidence
Definition: cluster.h:54
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2471
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1694
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1418
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: cluster.h:59
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1297
CLUSTERCONFIG Config
FLOAT32 Independence
Definition: cluster.h:53
FLOAT32 * CoVariance
Definition: cluster.cpp:173
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1111
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2159
FLOAT32 MinSamples
Definition: cluster.h:50
Definition: cluster.h:45
unsigned SampleCount
Definition: cluster.h:35
int inT32
Definition: host.h:35
#define HOTELLING
Definition: cluster.cpp:30
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1255
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1069
inT32 NumChar
Definition: cluster.h:93
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1218
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1647
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 MaxIllegal
Definition: cluster.h:51

◆ MakeSample()

SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure
Note
Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called
History: 5/29/89, DSJ, Created.

Definition at line 456 of file cluster.cpp.

457  {
458  SAMPLE *Sample;
459  int i;
460 
461  // see if the samples have already been clustered - if so trap an error
462  if (Clusterer->Root != NULL)
464  "Can't add samples after they have been clustered");
465 
466  // allocate the new sample and initialize it
467  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
468  (Clusterer->SampleSize -
469  1) * sizeof (FLOAT32));
470  Sample->Clustered = FALSE;
471  Sample->Prototype = FALSE;
472  Sample->SampleCount = 1;
473  Sample->Left = NULL;
474  Sample->Right = NULL;
475  Sample->CharID = CharID;
476 
477  for (i = 0; i < Clusterer->SampleSize; i++)
478  Sample->Mean[i] = Feature[i];
479 
480  // add the sample to the KD tree - keep track of the total # of samples
481  Clusterer->NumberOfSamples++;
482  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
483  if (CharID >= Clusterer->NumChar)
484  Clusterer->NumChar = CharID + 1;
485 
486  // execute hook for monitoring clustering operation
487  // (*SampleCreationHook)( Sample );
488 
489  return (Sample);
490 } // MakeSample
CLUSTER * Root
Definition: cluster.h:91
#define ALREADYCLUSTERED
Definition: cluster.h:133
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:219
KDTREE * KDTree
Definition: cluster.h:90
inT32 CharID
Definition: cluster.h:38
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
unsigned Clustered
Definition: cluster.h:33
inT32 NumChar
Definition: cluster.h:93
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
void * Emalloc(int Size)
Definition: emalloc.cpp:47
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87

◆ MakeSphericalProto()

PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by a spherical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new spherical prototype or NULL.
Note
Exceptions: None
History: 6/1/89, DSJ, Created.

Definition at line 1218 of file cluster.cpp.

1221  {
1222  PROTOTYPE *Proto = NULL;
1223  int i;
1224 
1225  // check that each dimension is a normal distribution
1226  for (i = 0; i < Clusterer->SampleSize; i++) {
1227  if (Clusterer->ParamDesc[i].NonEssential)
1228  continue;
1229 
1230  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1231  Cluster->Mean[i],
1232  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1233  if (!DistributionOK (Buckets))
1234  break;
1235  }
1236  // if all dimensions matched a normal distribution, make a proto
1237  if (i >= Clusterer->SampleSize)
1238  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1239  return (Proto);
1240 } // MakeSphericalProto
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 AvgVariance
Definition: cluster.cpp:172
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1505
double FLOAT64
Definition: host.h:45
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2131
inT8 NonEssential
Definition: ocrfeatures.h:48
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1990
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87

◆ Mean()

FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 650 of file cluster.cpp.

650  {
651  return (Proto->Mean[Dimension]);
652 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78

◆ MergeClusters() [1/2]

inT32 MergeClusters ( inT16  N,
register PARAM_DESC  ParamDesc[],
register inT32  n1,
register inT32  n2,
register FLOAT32  m[],
register FLOAT32  m1[],
register FLOAT32  m2[] 
)

◆ MergeClusters() [2/2]

inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.
Note
Exceptions: None
History: 5/31/89, DSJ, Created.

Definition at line 886 of file cluster.cpp.

891  {
892  inT32 i, n;
893 
894  n = n1 + n2;
895  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
896  if (ParamDesc->Circular) {
897  // if distance between means is greater than allowed
898  // reduce upper point by one "rotation" to compute mean
899  // then normalize the mean back into the accepted range
900  if ((*m2 - *m1) > ParamDesc->HalfRange) {
901  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
902  if (*m < ParamDesc->Min)
903  *m += ParamDesc->Range;
904  }
905  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
906  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else
911  *m = (n1 * *m1 + n2 * *m2) / n;
912  }
913  else
914  *m = (n1 * *m1 + n2 * *m2) / n;
915  }
916  return n;
917 } // MergeClusters
int inT32
Definition: host.h:35

◆ MultipleCharSamples()

BOOL8 MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
FLOAT32  MaxIllegal 
)

This routine looks at all samples in the specified cluster. It computes a running estimate of the percentage of the charaters which have more than 1 sample in the cluster. When this percentage exceeds MaxIllegal, TRUE is returned. Otherwise FALSE is returned. The CharID fields must contain integers which identify the training characters which were used to generate the sample. One integer is used for each sample. The NumChar field in the Clusterer must contain the number of characters in the training set. All CharID fields must be between 0 and NumChar-1. The main function of this routine is to help identify clusters which need to be split further, i.e. if numerous training characters have 2 or more features which are contained in the same cluster, then the cluster should be split.

Parameters
Clustererdata structure holding cluster tree
Clustercluster containing samples to be tested
MaxIllegalmax percentage of samples allowed to have more than 1 feature in the cluster
Returns
TRUE if the cluster should be split, FALSE otherwise.
Note
Exceptions: none
History: Wed Aug 30 11:13:05 1989, DSJ, Created. 2/22/90, DSJ, Added MaxIllegal control rather than always splitting illegal clusters.

Definition at line 2471 of file cluster.cpp.

2474 {
2475  static BOOL8 *CharFlags = NULL;
2476  static inT32 NumFlags = 0;
2477  int i;
2478  LIST SearchState;
2479  SAMPLE *Sample;
2480  inT32 CharID;
2481  inT32 NumCharInCluster;
2482  inT32 NumIllegalInCluster;
2483  FLOAT32 PercentIllegal;
2484 
2485  // initial estimate assumes that no illegal chars exist in the cluster
2486  NumCharInCluster = Cluster->SampleCount;
2487  NumIllegalInCluster = 0;
2488 
2489  if (Clusterer->NumChar > NumFlags) {
2490  if (CharFlags != NULL)
2491  memfree(CharFlags);
2492  NumFlags = Clusterer->NumChar;
2493  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2494  }
2495 
2496  for (i = 0; i < NumFlags; i++)
2497  CharFlags[i] = FALSE;
2498 
2499  // find each sample in the cluster and check if we have seen it before
2500  InitSampleSearch(SearchState, Cluster);
2501  while ((Sample = NextSample (&SearchState)) != NULL) {
2502  CharID = Sample->CharID;
2503  if (CharFlags[CharID] == FALSE) {
2504  CharFlags[CharID] = TRUE;
2505  }
2506  else {
2507  if (CharFlags[CharID] == TRUE) {
2508  NumIllegalInCluster++;
2509  CharFlags[CharID] = ILLEGAL_CHAR;
2510  }
2511  NumCharInCluster--;
2512  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2513  if (PercentIllegal > MaxIllegal) {
2514  destroy(SearchState);
2515  return (TRUE);
2516  }
2517  }
2518  }
2519  return (FALSE);
2520 
2521 } // MultipleCharSamples
#define TRUE
Definition: capi.h:45
inT32 CharID
Definition: cluster.h:38
#define InitSampleSearch(S, C)
Definition: cluster.h:105
void memfree(void *element)
Definition: freelist.cpp:30
#define ILLEGAL_CHAR
unsigned char BOOL8
Definition: host.h:46
#define FALSE
Definition: capi.h:46
LIST destroy(LIST list)
Definition: oldlist.cpp:182
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
int inT32
Definition: host.h:35
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:626
inT32 NumChar
Definition: cluster.h:93
Definition: cluster.h:32
void * Emalloc(int Size)
Definition: emalloc.cpp:47

◆ NewChiStruct()

CHISTRUCT * NewChiStruct ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine allocates a new data structure which is used to hold a chi-squared value along with its associated number of degrees of freedom and alpha value.

Parameters
DegreesOfFreedomdegrees of freedom for new chi value
Alphaconfidence level for new chi value
Returns
none
Note
Exceptions: none
History: Fri Aug 4 11:04:59 1989, DSJ, Created.

Definition at line 2332 of file cluster.cpp.

2332  {
2334 
2335  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2337  NewChiStruct->Alpha = Alpha;
2338  return (NewChiStruct);
2339 
2340 } // NewChiStruct
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2332
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2211
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
FLOAT64 Alpha
Definition: cluster.cpp:191
void * Emalloc(int Size)
Definition: emalloc.cpp:47

◆ NewEllipticalProto()

PROTOTYPE * NewEllipticalProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new elliptical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1538 of file cluster.cpp.

1540  {
1541  PROTOTYPE *Proto;
1542  FLOAT32 *CoVariance;
1543  int i;
1544 
1545  Proto = NewSimpleProto (N, Cluster);
1546  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1547  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1548  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1549 
1550  CoVariance = Statistics->CoVariance;
1551  Proto->TotalMagnitude = 1.0;
1552  for (i = 0; i < N; i++, CoVariance += N + 1) {
1553  Proto->Variance.Elliptical[i] = *CoVariance;
1554  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1555  Proto->Variance.Elliptical[i] = MINVARIANCE;
1556 
1557  Proto->Magnitude.Elliptical[i] =
1558  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1559  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1560  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1561  }
1562  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1563  Proto->Style = elliptical;
1564  return (Proto);
1565 } // NewEllipticalProto
FLOAT32 TotalMagnitude
Definition: cluster.h:79
#define PI
Definition: const.h:19
FLOAT32 LogMagnitude
Definition: cluster.h:80
#define MINVARIANCE
Definition: cluster.cpp:142
FLOATUNION Variance
Definition: cluster.h:81
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1606
FLOAT32 * CoVariance
Definition: cluster.cpp:173
float FLOAT32
Definition: host.h:44
FLOATUNION Magnitude
Definition: cluster.h:82
unsigned Style
Definition: cluster.h:74
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ NewMixedProto()

PROTOTYPE * NewMixedProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines.

Parameters
Nnumber of dimensions
Clustercluster to be made into a mixed prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new mixed prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1582 of file cluster.cpp.

1582  {
1583  PROTOTYPE *Proto;
1584  int i;
1585 
1586  Proto = NewEllipticalProto (N, Cluster, Statistics);
1587  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1588 
1589  for (i = 0; i < N; i++) {
1590  Proto->Distrib[i] = normal;
1591  }
1592  Proto->Style = mixed;
1593  return (Proto);
1594 } // NewMixedProto
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1538
DISTRIBUTION
Definition: cluster.h:58
Definition: cluster.h:59
Definition: cluster.h:45
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
void * Emalloc(int Size)
Definition: emalloc.cpp:47

◆ NewSimpleProto()

PROTOTYPE * NewSimpleProto ( inT16  N,
CLUSTER Cluster 
)

This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension.

Parameters
Nnumber of dimensions
Clustercluster to be made into a prototype
Returns
Pointer to new simple prototype
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1606 of file cluster.cpp.

1606  {
1607  PROTOTYPE *Proto;
1608  int i;
1609 
1610  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1611  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1612 
1613  for (i = 0; i < N; i++)
1614  Proto->Mean[i] = Cluster->Mean[i];
1615  Proto->Distrib = NULL;
1616 
1617  Proto->Significant = TRUE;
1618  Proto->Merged = FALSE;
1619  Proto->Style = spherical;
1620  Proto->NumSamples = Cluster->SampleCount;
1621  Proto->Cluster = Cluster;
1622  Proto->Cluster->Prototype = TRUE;
1623  return (Proto);
1624 } // NewSimpleProto
CLUSTER * Cluster
Definition: cluster.h:76
#define TRUE
Definition: capi.h:45
unsigned Merged
Definition: cluster.h:69
unsigned Prototype
Definition: cluster.h:34
unsigned Significant
Definition: cluster.h:68
unsigned NumSamples
Definition: cluster.h:75
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:44
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned SampleCount
Definition: cluster.h:35
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Style
Definition: cluster.h:74
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Mean[1]
Definition: cluster.h:39

◆ NewSphericalProto()

PROTOTYPE * NewSphericalProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new spherical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1505 of file cluster.cpp.

1507  {
1508  PROTOTYPE *Proto;
1509 
1510  Proto = NewSimpleProto (N, Cluster);
1511 
1512  Proto->Variance.Spherical = Statistics->AvgVariance;
1513  if (Proto->Variance.Spherical < MINVARIANCE)
1514  Proto->Variance.Spherical = MINVARIANCE;
1515 
1516  Proto->Magnitude.Spherical =
1517  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1518  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1519  (double) N);
1520  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1521  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1522 
1523  return (Proto);
1524 } // NewSphericalProto
FLOAT32 TotalMagnitude
Definition: cluster.h:79
#define PI
Definition: const.h:19
FLOAT32 LogMagnitude
Definition: cluster.h:80
#define MINVARIANCE
Definition: cluster.cpp:142
FLOATUNION Variance
Definition: cluster.h:81
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1606
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 Spherical
Definition: cluster.h:63
FLOATUNION Magnitude
Definition: cluster.h:82
FLOATUNION Weight
Definition: cluster.h:83

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or NULL.
Note
Exceptions: None
History: 6/16/89, DSJ, Created.

Definition at line 626 of file cluster.cpp.

626  {
627  CLUSTER *Cluster;
628 
629  if (*SearchState == NIL_LIST)
630  return (NULL);
631  Cluster = (CLUSTER *) first_node (*SearchState);
632  *SearchState = pop (*SearchState);
633  while (TRUE) {
634  if (Cluster->Left == NULL)
635  return (Cluster);
636  *SearchState = push (*SearchState, Cluster->Right);
637  Cluster = Cluster->Left;
638  }
639 } // NextSample
#define first_node(l)
Definition: oldlist.h:139
#define TRUE
Definition: capi.h:45
#define NIL_LIST
Definition: oldlist.h:126
LIST pop(LIST list)
Definition: oldlist.cpp:299
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36

◆ NormalBucket()

uinT16 NormalBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete normal distribution defined by kNormalMean and kNormalStdDev. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meanmean of normal distribution
StdDevstandard deviation of normal distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2062 of file cluster.cpp.

2065  {
2066  FLOAT32 X;
2067 
2068  // wraparound circular parameters if necessary
2069  if (ParamDesc->Circular) {
2070  if (x - Mean > ParamDesc->HalfRange)
2071  x -= ParamDesc->Range;
2072  else if (x - Mean < -ParamDesc->HalfRange)
2073  x += ParamDesc->Range;
2074  }
2075 
2076  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2077  if (X < 0)
2078  return 0;
2079  if (X > BUCKETTABLESIZE - 1)
2080  return ((uinT16) (BUCKETTABLESIZE - 1));
2081  return (uinT16) floor((FLOAT64) X);
2082 } // NormalBucket
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
FLOAT32 Range
Definition: ocrfeatures.h:51
unsigned short uinT16
Definition: host.h:34
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
float FLOAT32
Definition: host.h:44
double FLOAT64
Definition: host.h:45
inT8 Circular
Definition: ocrfeatures.h:47
#define BUCKETTABLESIZE
Definition: cluster.cpp:160

◆ NormalDensity()

FLOAT64 NormalDensity ( inT32  x)

This routine computes the probability density function of a discrete normal distribution defined by the global variables kNormalMean, kNormalVariance, and kNormalMagnitude. Normal magnitude could, of course, be computed in terms of the normal variance but it is precomputed for efficiency.

Parameters
xnumber to compute the normal probability density for
Note
Globals: kNormalMean mean of a discrete normal distribution kNormalVariance variance of a discrete normal distribution kNormalMagnitude magnitude of a discrete normal distribution
Returns
The value of the normal distribution at x.
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1929 of file cluster.cpp.

1929  {
1930  FLOAT64 Distance;
1931 
1932  Distance = x - kNormalMean;
1933  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1934 } // NormalDensity
double FLOAT64
Definition: host.h:45

◆ NumBucketsMatch()

int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of histogram data structures to find one with the specified number of buckets. It is called by the list search routines.

Parameters
arg1current histogram being tested for a match
arg2match key
Returns
TRUE if arg1 matches arg2
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2233 of file cluster.cpp.

2234  { // uinT16 *DesiredNumberOfBuckets)
2235  BUCKETS *Histogram = (BUCKETS *) arg1;
2236  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2237 
2238  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2239 
2240 } // NumBucketsMatch
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
unsigned short uinT16
Definition: host.h:34

◆ OptimumNumberOfBuckets()

uinT16 OptimumNumberOfBuckets ( uinT32  SampleCount)

This routine computes the optimum number of histogram buckets that should be used in a chi-squared goodness of fit test for the specified number of samples. The optimum number is computed based on Table 4.1 on pg. 147 of "Measurement and Analysis of Random Data" by Bendat & Piersol. Linear interpolation is used to interpolate between table values. The table is intended for a 0.05 level of significance (alpha). This routine assumes that it is equally valid for other alpha's, which may not be true.

Parameters
SampleCountnumber of samples to be tested
Returns
Optimum number of histogram buckets
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1838 of file cluster.cpp.

1838  {
1839  uinT8 Last, Next;
1840  FLOAT32 Slope;
1841 
1842  if (SampleCount < kCountTable[0])
1843  return kBucketsTable[0];
1844 
1845  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1846  if (SampleCount <= kCountTable[Next]) {
1847  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1848  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1849  return ((uinT16) (kBucketsTable[Last] +
1850  Slope * (SampleCount - kCountTable[Last])));
1851  }
1852  }
1853  return kBucketsTable[Last];
1854 } // OptimumNumberOfBuckets
unsigned char uinT8
Definition: host.h:32
unsigned short uinT16
Definition: host.h:34
float FLOAT32
Definition: host.h:44
#define LOOKUPTABLESIZE
Definition: cluster.cpp:228

◆ Solve()

FLOAT64 Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
FLOAT64  InitialGuess,
FLOAT64  Accuracy 
)

This routine attempts to find an x value at which Function goes to zero (i.e. a root of the function ). It will only work correctly if a solution actually exists and there are no extrema between the solution and the InitialGuess. The algorithms used are extremely primitive.

Parameters
Functionfunction whose zero is to be found
FunctionParamsarbitrary data to pass to function
InitialGuesspoint to start solution search at
Accuracymaximum allowed error
Returns
Solution of function ( x for which f(x) = 0 ).
Note
Exceptions: none
History: Fri Aug 4 11:08:59 1989, DSJ, Created.

Definition at line 2358 of file cluster.cpp.

2362 {
2363  FLOAT64 x;
2364  FLOAT64 f;
2365  FLOAT64 Slope;
2366  FLOAT64 Delta;
2367  FLOAT64 NewDelta;
2368  FLOAT64 xDelta;
2369  FLOAT64 LastPosX, LastNegX;
2370 
2371  x = InitialGuess;
2372  Delta = INITIALDELTA;
2373  LastPosX = MAX_FLOAT32;
2374  LastNegX = -MAX_FLOAT32;
2375  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2376  while (Abs (LastPosX - LastNegX) > Accuracy) {
2377  // keep track of outer bounds of current estimate
2378  if (f < 0)
2379  LastNegX = x;
2380  else
2381  LastPosX = x;
2382 
2383  // compute the approx. slope of f(x) at the current point
2384  Slope =
2385  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2386 
2387  // compute the next solution guess */
2388  xDelta = f / Slope;
2389  x -= xDelta;
2390 
2391  // reduce the delta used for computing slope to be a fraction of
2392  //the amount moved to get to the new guess
2393  NewDelta = Abs (xDelta) * DELTARATIO;
2394  if (NewDelta < Delta)
2395  Delta = NewDelta;
2396 
2397  // compute the value of the function at the new guess
2398  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2399  }
2400  return (x);
2401 
2402 } // Solve
#define DELTARATIO
#define MAX_FLOAT32
Definition: host.h:57
double FLOAT64
Definition: host.h:45
#define INITIALDELTA
#define Abs(N)
Definition: cluster.cpp:208

◆ StandardDeviation()

FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 663 of file cluster.cpp.

663  {
664  switch (Proto->Style) {
665  case spherical:
666  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
667  case elliptical:
668  return ((FLOAT32)
669  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
670  case mixed:
671  switch (Proto->Distrib[Dimension]) {
672  case normal:
673  return ((FLOAT32)
674  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
675  case uniform:
676  case D_random:
677  return (Proto->Variance.Elliptical[Dimension]);
678  case DISTRIBUTION_COUNT:
679  ASSERT_HOST(!"Distribution count not allowed!");
680  }
681  }
682  return 0.0f;
683 } // StandardDeviation
Definition: cluster.h:59
FLOATUNION Variance
Definition: cluster.h:81
Definition: cluster.h:45
float FLOAT32
Definition: host.h:44
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 Spherical
Definition: cluster.h:63
unsigned Style
Definition: cluster.h:74
#define ASSERT_HOST(x)
Definition: errcode.h:84
FLOAT32 * Elliptical
Definition: cluster.h:64

◆ TestEllipticalProto()

PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Configprovides the magic number of samples that make a good cluster
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Returns
Pointer to new elliptical prototype or NULL.

Definition at line 1111 of file cluster.cpp.

1114  {
1115  // Fraction of the number of samples used as a range around 1 within
1116  // which a cluster has the magic size that allows a boost to the
1117  // FTable by kFTableBoostMargin, thus allowing clusters near the
1118  // magic size (equal to the number of sample characters) to be more
1119  // likely to stay together.
1120  const double kMagicSampleMargin = 0.0625;
1121  const double kFTableBoostMargin = 2.0;
1122 
1123  int N = Clusterer->SampleSize;
1124  CLUSTER* Left = Cluster->Left;
1125  CLUSTER* Right = Cluster->Right;
1126  if (Left == NULL || Right == NULL)
1127  return NULL;
1128  int TotalDims = Left->SampleCount + Right->SampleCount;
1129  if (TotalDims < N + 1 || TotalDims < 2)
1130  return NULL;
1131  const int kMatrixSize = N * N * sizeof(FLOAT32);
1132  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1133  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1134  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1135  // Compute a new covariance matrix that only uses essential features.
1136  for (int i = 0; i < N; ++i) {
1137  int row_offset = i * N;
1138  if (!Clusterer->ParamDesc[i].NonEssential) {
1139  for (int j = 0; j < N; ++j) {
1140  if (!Clusterer->ParamDesc[j].NonEssential)
1141  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1142  else
1143  Covariance[j + row_offset] = 0.0f;
1144  }
1145  } else {
1146  for (int j = 0; j < N; ++j) {
1147  if (i == j)
1148  Covariance[j + row_offset] = 1.0f;
1149  else
1150  Covariance[j + row_offset] = 0.0f;
1151  }
1152  }
1153  }
1154  double err = InvertMatrix(Covariance, N, Inverse);
1155  if (err > 1) {
1156  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1157  }
1158  int EssentialN = 0;
1159  for (int dim = 0; dim < N; ++dim) {
1160  if (!Clusterer->ParamDesc[dim].NonEssential) {
1161  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1162  ++EssentialN;
1163  } else {
1164  Delta[dim] = 0.0f;
1165  }
1166  }
1167  // Compute Hotelling's T-squared.
1168  double Tsq = 0.0;
1169  for (int x = 0; x < N; ++x) {
1170  double temp = 0.0;
1171  for (int y = 0; y < N; ++y) {
1172  temp += Inverse[y + N*x] * Delta[y];
1173  }
1174  Tsq += Delta[x] * temp;
1175  }
1176  memfree(Covariance);
1177  memfree(Inverse);
1178  memfree(Delta);
1179  // Changed this function to match the formula in
1180  // Statistical Methods in Medical Research p 473
1181  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1182  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1183  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1184  int Fx = EssentialN;
1185  if (Fx > FTABLE_X)
1186  Fx = FTABLE_X;
1187  --Fx;
1188  int Fy = TotalDims - EssentialN - 1;
1189  if (Fy > FTABLE_Y)
1190  Fy = FTABLE_Y;
1191  --Fy;
1192  double FTarget = FTable[Fy][Fx];
1193  if (Config->MagicSamples > 0 &&
1194  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1195  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1196  // Give magic-sized clusters a magic FTable boost.
1197  FTarget += kFTableBoostMargin;
1198  }
1199  if (F < FTarget) {
1200  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1201  }
1202  return NULL;
1203 }
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:35
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1538
PARAM_DESC * ParamDesc
Definition: cluster.h:88
CLUSTERCONFIG Config
#define FTABLE_Y
Definition: cluster.cpp:32
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * CoVariance
Definition: cluster.cpp:173
#define FTABLE_X
Definition: cluster.cpp:31
float FLOAT32
Definition: host.h:44
unsigned SampleCount
Definition: cluster.h:35
int MagicSamples
Definition: cluster.h:55
#define tprintf(...)
Definition: tprintf.h:31
inT8 NonEssential
Definition: ocrfeatures.h:48
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2528
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
void * Emalloc(int Size)
Definition: emalloc.cpp:47
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87

◆ UniformBucket()

uinT16 UniformBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete uniform distribution defined by BUCKETTABLESIZE. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meancenter of range of uniform distribution
StdDev1/2 the range of the uniform distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2097 of file cluster.cpp.

2100  {
2101  FLOAT32 X;
2102 
2103  // wraparound circular parameters if necessary
2104  if (ParamDesc->Circular) {
2105  if (x - Mean > ParamDesc->HalfRange)
2106  x -= ParamDesc->Range;
2107  else if (x - Mean < -ParamDesc->HalfRange)
2108  x += ParamDesc->Range;
2109  }
2110 
2111  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2112  if (X < 0)
2113  return 0;
2114  if (X > BUCKETTABLESIZE - 1)
2115  return (uinT16) (BUCKETTABLESIZE - 1);
2116  return (uinT16) floor((FLOAT64) X);
2117 } // UniformBucket
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
FLOAT32 Range
Definition: ocrfeatures.h:51
unsigned short uinT16
Definition: host.h:34
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
float FLOAT32
Definition: host.h:44
double FLOAT64
Definition: host.h:45
inT8 Circular
Definition: ocrfeatures.h:47
#define BUCKETTABLESIZE
Definition: cluster.cpp:160

◆ UniformDensity()

FLOAT64 UniformDensity ( inT32  x)

This routine computes the probability density function of a uniform distribution at the specified point. The range of the distribution is from 0 to BUCKETTABLESIZE.

Parameters
xnumber to compute the uniform probability density for
Returns
The value of the uniform distribution at x.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1945 of file cluster.cpp.

1945  {
1946  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1947 
1948  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1949  return UniformDistributionDensity;
1950  else
1951  return (FLOAT64) 0.0;
1952 } // UniformDensity
double FLOAT64
Definition: host.h:45
#define BUCKETTABLESIZE
Definition: cluster.cpp:160

Variable Documentation

◆ FTable

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 35 of file cluster.cpp.