16 #include "allheaders.h" 66 "Min number of samples per proto as % of total");
68 "Max percentage of samples in a cluster which have more" 69 " than 1 feature in that cluster");
71 "Desired independence between dimensions");
73 "Desired confidence in prototypes created");
92 usage +=
" [.tr files ...]";
99 MAX(0.0,
MIN(1.0,
double(FLAGS_clusterconfig_min_samples_fraction)));
101 MAX(0.0,
MIN(1.0,
double(FLAGS_clusterconfig_max_illegal)));
103 MAX(0.0,
MIN(1.0,
double(FLAGS_clusterconfig_independence)));
105 MAX(0.0,
MIN(1.0,
double(FLAGS_clusterconfig_confidence)));
107 if (!FLAGS_configfile.empty()) {
109 FLAGS_configfile.c_str(),
119 STRING shape_table_file = file_prefix;
120 shape_table_file += kShapeTableFileSuffix;
121 FILE* shape_fp = fopen(shape_table_file.
string(),
"rb");
122 if (shape_fp != NULL) {
127 tprintf(
"Error: Failed to read shape table %s\n",
128 shape_table_file.
string());
130 int num_shapes = shape_table->
NumShapes();
131 tprintf(
"Read shape table %s of %d shapes\n",
132 shape_table_file.
string(), num_shapes);
136 tprintf(
"Warning: No shape table file present: %s\n",
137 shape_table_file.
string());
144 STRING shape_table_file = file_prefix;
145 shape_table_file += kShapeTableFileSuffix;
146 FILE* fp = fopen(shape_table_file.
string(),
"wb");
149 fprintf(stderr,
"Error writing shape table: %s\n",
150 shape_table_file.
string());
154 fprintf(stderr,
"Error creating shape table: %s\n",
155 shape_table_file.
string());
181 if (!FLAGS_D.empty()) {
182 *file_prefix += FLAGS_D.
c_str();
189 bool shape_analysis =
false;
190 if (shape_table != NULL) {
192 if (*shape_table != NULL)
193 shape_analysis =
true;
195 shape_analysis =
true;
203 if (FLAGS_T.empty()) {
206 if (!FLAGS_F.empty()) {
212 if (!FLAGS_X.empty()) {
219 const char* page_name;
222 tprintf(
"Reading %s ...\n", page_name);
227 int pagename_len = strlen(page_name);
228 char *fontinfo_file_name =
new char[pagename_len + 7];
229 strncpy(fontinfo_file_name, page_name, pagename_len - 2);
230 strcpy(fontinfo_file_name + pagename_len - 2,
"fontinfo");
232 delete[] fontinfo_file_name;
235 if (FLAGS_load_images) {
236 STRING image_name = page_name;
245 if (!FLAGS_output_trainer.empty()) {
246 FILE* fp = fopen(FLAGS_output_trainer.c_str(),
"wb");
248 tprintf(
"Can't create saved trainer data!\n");
255 bool success =
false;
256 tprintf(
"Loading master trainer from file:%s\n",
258 FILE* fp = fopen(FLAGS_T.c_str(),
"rb");
260 tprintf(
"Can't read file %s to initialize master trainer\n",
267 tprintf(
"Deserialize of master trainer failed!\n");
274 if (!FLAGS_O.empty() &&
276 fprintf(stderr,
"Failed to save unicharset to file %s\n", FLAGS_O.c_str());
280 if (shape_table != NULL) {
283 if (*shape_table == NULL) {
286 tprintf(
"Flat shape table summary: %s\n",
287 (*shape_table)->SummaryStr().string());
289 (*shape_table)->set_unicharset(trainer->
unicharset());
334 if (strcmp (LabeledList->
Label, Label) == 0)
335 return (LabeledList);
356 strcpy (LabeledList->
Label, Label);
360 return (LabeledList);
387 const char *feature_name,
int max_samples,
389 FILE* file,
LIST* training_samples) {
398 LIST it = *training_samples;
404 while (fgets(buffer, 2048, file) != NULL) {
405 if (buffer[0] ==
'\n')
408 sscanf(buffer,
"%*s %s", unichar);
412 tprintf(
"Error: Size of unicharset in training is " 413 "greater than MAX_NUM_CLASSES\n");
417 char_sample =
FindList(*training_samples, unichar);
418 if (char_sample == NULL) {
420 *training_samples =
push(*training_samples, char_sample);
423 feature_samples = char_desc->
FeatureSets[feature_type];
425 char_sample->
List =
push(char_sample->
List, feature_samples);
432 if (feature_type != i)
455 LIST nodes = CharList;
458 FeatureList = char_sample->
List;
481 free(LabeledList->
Label);
501 const char* program_feature_type) {
507 LIST FeatureList = NULL;
514 FeatureList = char_sample->
List;
521 for (j = 0; j < N; j++)
536 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
538 LIST pProtoList = ProtoList;
546 LIST list_it = ProtoList;
549 if (test_p != Prototype && !test_p->
Merged) {
553 if (dist < best_dist) {
559 if (best_match != NULL && !best_match->
Significant) {
561 tprintf(
"Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
563 best_match->
Mean[0], best_match->
Mean[1],
564 Prototype->
Mean[0], Prototype->
Mean[1]);
573 }
else if (best_match != NULL) {
575 tprintf(
"Red proto at %g,%g matched a green one at %g,%g\n",
576 Prototype->
Mean[0], Prototype->
Mean[1],
577 best_match->
Mean[0], best_match->
Mean[1]);
583 pProtoList = ProtoList;
590 tprintf(
"Red proto at %g,%g becoming green\n",
591 Prototype->
Mean[0], Prototype->
Mean[1]);
628 BOOL8 KeepInsigProtos,
638 pProtoList = ProtoList;
654 for (i=0; i < N; i++)
659 for (i=0; i < N; i++)
668 for (i=0; i < N; i++)
677 for (i=0; i < N; i++)
685 NewProtoList =
push_last(NewProtoList, NewProto);
689 return (NewProtoList);
699 if (strcmp (MergeClass->
Label, Label) == 0)
712 strcpy (MergeClass->
Label, Label);
731 LIST nodes = ClassList;
735 free (MergeClass->
Label);
745 LIST LabeledClassList) {
772 for(i=0; i < NumProtos; i++)
776 Values[0] = OldProto->
X;
777 Values[1] = OldProto->
Y;
778 Values[2] = OldProto->
Angle;
780 NewProto->
X = OldProto->
X;
781 NewProto->
Y = OldProto->
Y;
784 NewProto->
A = Values[0];
785 NewProto->
B = Values[1];
786 NewProto->
C = Values[2];
794 for(i=0; i < NumConfigs; i++)
798 for(j=0; j < NumWords; j++)
799 NewConfig[j] = OldConfig[j];
803 return float_classes;
814 Slope = tan (Values [2] * 2 *
PI);
815 Intercept = Values [1] - Slope * Values [0];
816 Normalizer = 1 / sqrt (Slope * Slope + 1.0);
818 Values [0] = Slope * Normalizer;
819 Values [1] = - Normalizer;
820 Values [2] = Intercept * Normalizer;
829 LIST nodes = CharList;
852 LabeledProtoList->
List =
push(LabeledProtoList->
List, Proto);
854 *NormProtoList =
push(*NormProtoList, LabeledProtoList);
859 BOOL8 CountInsigProtos) {
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
void FreeLabeledList(LABELEDLIST LabeledList)
FEATURE_DEFS_STRUCT feature_defs
void FreeNormProtoList(LIST CharList)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
bool DeSerialize(bool swap, FILE *fp)
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
bool TESS_API contains_unichar(const char *const unichar_repr) const
bool AddSpacingInfo(const char *filename)
LABELEDLIST FindList(LIST List, char *Label)
void LoadUnicharset(const char *filename)
MERGE_CLASS_NODE * MERGE_CLASS
STRING_PARAM_FLAG(configfile, "", "File to load more configs from")
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples, "Min number of samples per proto as % of total")
UnicityTableEqEq< int > font_set
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
bool save_to_file(const char *const filename) const
void memfree(void *element)
LABELEDLIST NewLabeledList(const char *Label)
void ParseArguments(int *argc, char ***argv)
void Normalize(float *Values)
LIST RemoveInsignificantProtos(LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
BIT_VECTOR NewBitVector(int NumBits)
const int kBoostXYBuckets
ShapeTable * LoadShapeTable(const STRING &file_prefix)
bool LoadFontInfo(const char *filename)
#define WordsInVectorOfSize(NumBits)
bool Serialize(FILE *fp) const
FEATURE_SET_STRUCT * FEATURE_SET
bool LoadXHeights(const char *filename)
void truncate_at(inT32 index)
const char * string() const
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
void FreeClass(CLASS_TYPE Class)
LIST push_last(LIST list, void *item)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
void CleanUpUnusedData(LIST ProtoList)
bool DeSerialize(bool swap, FILE *fp)
const PARAM_DESC * ParamDesc
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
const char * c_str() const
const char * GetNextFilename(int argc, const char *const *argv)
void FreeTrainingSamples(LIST CharList)
void TESS_API unichar_insert(const char *const unichar_repr)
LIST push(LIST list, void *element)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
static bool TESS_API ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
void SetupFlatShapeTable(ShapeTable *shape_table)
struct LABELEDLISTNODE * LABELEDLIST
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
bool Serialize(FILE *fp) const
#define ProtoIn(Class, Pid)
const int kBoostDirBuckets
void FreeLabeledClassList(LIST ClassList)
void SetFeatureSpace(const IntFeatureSpace &fs)
void LoadPageImages(const char *filename)
void FreeFeatureSet(FEATURE_SET FeatureSet)
MERGE_CLASS NewLabeledClass(const char *Label)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
MERGE_CLASS FindClass(LIST List, const char *Label)
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
void move(UnicityTable< T > *from)
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
const UNICHARSET & unicharset() const
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
void FreeProtoList(LIST *ProtoList)