33 #include "unicode/uchar.h" 34 #include "unicode/uscript.h" 42 for (
int unichar_id = 0; unichar_id < unicharset->
size(); ++unichar_id) {
44 const char* unichar_str = unicharset->
id_to_unichar(unichar_id);
58 bool unichar_isalpha =
false;
59 bool unichar_islower =
false;
60 bool unichar_isupper =
false;
61 bool unichar_isdigit =
false;
62 bool unichar_ispunct =
false;
64 for (
int i = 0; i < uni_vector.
size(); ++i) {
65 if (u_isalpha(uni_vector[i]))
66 unichar_isalpha =
true;
67 if (u_islower(uni_vector[i]))
68 unichar_islower =
true;
69 if (u_isupper(uni_vector[i]))
70 unichar_isupper =
true;
71 if (u_isdigit(uni_vector[i]))
72 unichar_isdigit =
true;
73 if (u_ispunct(uni_vector[i]))
74 unichar_ispunct =
true;
77 unicharset->
set_isalpha(unichar_id, unichar_isalpha);
78 unicharset->
set_islower(unichar_id, unichar_islower);
79 unicharset->
set_isupper(unichar_id, unichar_isupper);
80 unicharset->
set_isdigit(unichar_id, unichar_isdigit);
84 unicharset->
set_script(unichar_id, uscript_getName(
85 uscript_getScript(uni_vector[0], err)));
87 const int num_code_points = uni_vector.
size();
90 if (unichar_islower || unichar_isupper) {
92 for (
int i = 0; i < num_code_points; ++i) {
97 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
98 u_tolower(uni_vector[i]);
104 if (other_case_id != INVALID_UNICHAR_ID) {
107 tprintf(
"Other case %s of %s is not in unicharset\n",
108 other_case_uch.
c_str(), unichar_str);
114 for (
int i = 0; i < num_code_points; ++i) {
115 mirrors[i] = u_charMirror(uni_vector[i]);
118 static_cast<UNICHARSET::Direction>(
119 u_charDirection(uni_vector[i])));
125 if (mirror_uch_id != INVALID_UNICHAR_ID) {
126 unicharset->
set_mirror(unichar_id, mirror_uch_id);
127 }
else if (report_errors) {
128 tprintf(
"Mirror %s of %s is not in unicharset\n",
129 mirror_uch.
c_str(), unichar_str);
134 if (unichar_id != 0 && normed_str.
length() > 0) {
137 unicharset->
set_normed(unichar_id, unichar_str);
150 const string& input_unicharset_file,
151 const string& output_unicharset_file,
152 const string& output_xheights_file) {
157 tprintf(
"Loaded unicharset of size %d from file %s\n", unicharset.
size(),
158 input_unicharset_file.c_str());
161 tprintf(
"Setting unichar properties\n");
166 string filename = script_dir +
"/" +
175 string script_heights;
177 xheights_str += script_heights;
179 if (!output_xheights_file.empty())
183 tprintf(
"Warning: properties incomplete for index %d = %s\n",
189 tprintf(
"Writing unicharset to file %s\n", output_unicharset_file.c_str());
190 unicharset.
save_to_file(output_unicharset_file.c_str());
void SetPropertiesFromOther(const UNICHARSET &src)
void SetPropertiesForInputFile(const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file)
void set_islower(UNICHAR_ID unichar_id, bool value)
int get_script_table_size() const
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
void set_normed(UNICHAR_ID unichar_id, const char *normed)
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
const char * get_script_from_script_id(int id) const
bool save_to_file(const char *const filename) const
STRING NormalizeUTF8String(bool decompose, const char *str8)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void set_isdigit(UNICHAR_ID unichar_id, bool value)
void set_isupper(UNICHAR_ID unichar_id, bool value)
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
static TESS_API const char * kCustomLigatures[][2]
const char * c_str() const
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
bool load_from_file(const char *const filename, bool skip_fragments)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
static bool ReadFileToString(const string &filename, string *out)
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
static void WriteStringToFileOrDie(const string &str, const string &filename)
void set_script(UNICHAR_ID unichar_id, const char *value)