#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"
Go to the source code of this file.
◆ main()
int main(int argc, char **argv)
This program reads in a text file consisting of feature samples from a training page in the following format:
FontName UTF8-char-str xmin ymin xmax ymax page-number
NumberOfFeatureTypes(N)
FeatureTypeName1 NumberOfFeatures(M)
Feature1
...
FeatureM
FeatureTypeName2 NumberOfFeatures(M)
Feature1
...
FeatureM
...
FeatureTypeNameN NumberOfFeatures(M)
Feature1
...
FeatureM
FontName CharName ...
The result of this program is a binary inttemp file used by the OCR engine.
- Parameters
  - argc: number of command line arguments
  - argv: array of command line arguments
- Returns
- none
- Note
- Exceptions: none
-
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
-
History: Mon May 18 1998, Christy Russson, Revision started.
Definition at line 96 of file unicharset_extractor.cpp.
98 const char* output_directory =
".";
99 STRING unicharset_file_name;
103 setlocale(LC_ALL,
"");
107 printf(
"Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
109 printf(
"Character properties using wctype is enabled\n");
111 printf(
"WARNING: Character properties using wctype is DISABLED\n");
118 while ((option =
tessopt(argc, argv,
"D" )) != EOF) {
128 unicharset_file_name = output_directory;
129 unicharset_file_name +=
"/";
130 unicharset_file_name += kUnicharsetFileName;
134 printf(
"Extracting unicharset from %s\n", argv[
tessoptind]);
136 FILE* box_file = fopen(argv[
tessoptind],
"rb");
137 if (box_file == NULL) {
138 printf(
"Cannot open box file %s\n", argv[
tessoptind]);
145 while (
ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
153 printf(
"Wrote unicharset file %s.\n", unicharset_file_name.
string());
156 printf(
"Cannot save unicharset file %s.\n", unicharset_file_name.
string());
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
bool save_to_file(const char *const filename) const
const char * string() const
void set_properties(UNICHARSET *unicharset, const char *const c_string)
void TESS_API unichar_insert(const char *const unichar_repr)
int tessopt(inT32 argc, char *argv[], const char *arglist)
◆ set_properties()
void set_properties(UNICHARSET *unicharset, const char *const c_string)
Definition at line 55 of file unicharset_extractor.cpp.
void set_islower(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
void set_isupper(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
static int utf8_step(const char *utf8_str)
◆ wc_to_unichar_id()
Definition at line 43 of file unicharset_extractor.cpp.
45 char *unichar = uch.utf8_str();
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const