29 #define strtok_r strtok_s 54 bool use_ambigs_for_adaption) {
55 for (
int i = 0; i < unicharset.
size(); ++i) {
58 one_to_one_definite_ambigs_.push_back(NULL);
59 if (use_ambigs_for_adaption) {
60 ambigs_for_adaption_.push_back(NULL);
61 reverse_ambigs_for_adaption_.push_back(NULL);
77 bool use_ambigs_for_adaption,
81 if (debug_level)
tprintf(
"Reading ambiguities\n");
83 int test_ambig_part_size;
84 int replacement_ambig_part_size;
88 char *buffer =
new char[kBufferSize];
99 version =
static_cast<int>(strtol(buffer+1, NULL, 10));
104 while (ambig_file->
FGets(buffer, kBufferSize) != NULL) {
106 if (debug_level > 2)
tprintf(
"read line %s\n", buffer);
108 if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
109 buffer, &test_ambig_part_size, test_unichar_ids,
110 &replacement_ambig_part_size,
111 replacement_string, &type))
continue;
114 if (!InsertIntoTable((type ==
REPLACE_AMBIG) ? replace_ambigs_
116 test_ambig_part_size, test_unichar_ids,
117 replacement_ambig_part_size, replacement_string, type,
118 ambig_spec, unicharset))
122 if (test_ambig_part_size == 1 &&
124 if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
125 one_to_one_definite_ambigs_[test_unichar_ids[0]] =
new UnicharIdVector();
127 one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
131 if (use_ambigs_for_adaption) {
135 if (unicharset->
encode_string(replacement_string,
true, &encoding,
137 for (i = 0; i < test_ambig_part_size; ++i) {
138 if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
141 adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
142 for (
int r = 0; r < encoding.
size(); ++r) {
147 for (j = 0; j < adaption_ambigs_entry->
size() &&
148 (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
149 if (j < adaption_ambigs_entry->size()) {
150 if ((*adaption_ambigs_entry)[j] != id_to_insert) {
151 adaption_ambigs_entry->
insert(id_to_insert, j);
154 adaption_ambigs_entry->
push_back(id_to_insert);
164 if (use_ambigs_for_adaption) {
165 for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
166 adaption_ambigs_entry = ambigs_for_adaption_[i];
167 if (adaption_ambigs_entry == NULL)
continue;
168 for (j = 0; j < adaption_ambigs_entry->
size(); ++j) {
169 UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
170 if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
173 reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
179 if (debug_level > 1) {
180 for (
int tbl = 0; tbl < 2; ++tbl) {
182 (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
183 for (i = 0; i < print_table.
size(); ++i) {
184 AmbigSpec_LIST *lst = print_table[i];
185 if (lst == NULL)
continue;
187 tprintf(
"%s Ambiguities for %s:\n",
188 (tbl == 0) ?
"Replaceable" :
"Dangerous",
191 AmbigSpec_IT lst_it(lst);
192 for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
201 if (use_ambigs_for_adaption) {
202 for (
int vec_id = 0; vec_id < 2; ++vec_id) {
204 ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
205 for (i = 0; i < vec.
size(); ++i) {
206 adaption_ambigs_entry = vec[i];
207 if (adaption_ambigs_entry != NULL) {
208 tprintf(
"%sAmbigs for adaption for %s:\n",
209 (vec_id == 0) ?
"" :
"Reverse ",
211 for (j = 0; j < adaption_ambigs_entry->
size(); ++j) {
213 (*adaption_ambigs_entry)[j]).
string());
223 bool UnicharAmbigs::ParseAmbiguityLine(
224 int line_num,
int version,
int debug_level,
const UNICHARSET &unicharset,
225 char *buffer,
int *test_ambig_part_size,
UNICHAR_ID *test_unichar_ids,
226 int *replacement_ambig_part_size,
char *replacement_string,
int *type) {
231 input.split(
' ', &fields);
232 if (fields.
size() != 3) {
233 if (debug_level)
tprintf(kIllegalMsg, line_num);
238 if (!unicharset.
encode_string(fields[0].string(),
true, &unichars, NULL,
242 *test_ambig_part_size = unichars.
size();
245 tprintf(
"Too many unichars in ambiguity on line %d\n", line_num);
249 for (
int i = 0; i < unichars.
size(); ++i)
250 test_unichar_ids[i] = unichars[i];
251 test_unichar_ids[unichars.
size()] = INVALID_UNICHAR_ID;
253 if (!unicharset.
encode_string(fields[1].string(),
true, &unichars, NULL,
257 *replacement_ambig_part_size = unichars.
size();
260 tprintf(
"Too many unichars in ambiguity on line %d\n", line_num);
263 if (sscanf(fields[2].
string(),
"%d", type) != 1) {
264 if (debug_level)
tprintf(kIllegalMsg, line_num);
273 if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
274 !sscanf(token,
"%d", test_ambig_part_size) ||
275 *test_ambig_part_size <= 0) {
276 if (debug_level)
tprintf(kIllegalMsg, line_num);
281 tprintf(
"Too many unichars in ambiguity on line %d\n", line_num);
284 for (i = 0; i < *test_ambig_part_size; ++i) {
285 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)))
break;
287 if (debug_level)
tprintf(kIllegalUnicharMsg, token);
292 test_unichar_ids[i] = INVALID_UNICHAR_ID;
294 if (i != *test_ambig_part_size ||
295 !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
296 !sscanf(token,
"%d", replacement_ambig_part_size) ||
297 *replacement_ambig_part_size <= 0) {
298 if (debug_level)
tprintf(kIllegalMsg, line_num);
303 tprintf(
"Too many unichars in ambiguity on line %d\n", line_num);
306 replacement_string[0] =
'\0';
307 for (i = 0; i < *replacement_ambig_part_size; ++i) {
308 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)))
break;
309 strcat(replacement_string, token);
311 if (debug_level)
tprintf(kIllegalUnicharMsg, token);
315 if (i != *replacement_ambig_part_size) {
316 if (debug_level)
tprintf(kIllegalMsg, line_num);
329 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
330 !sscanf(token,
"%d", type)) {
331 if (debug_level)
tprintf(kIllegalMsg, line_num);
338 bool UnicharAmbigs::InsertIntoTable(
340 UNICHAR_ID *test_unichar_ids,
int replacement_ambig_part_size,
341 const char *replacement_string,
int type,
342 AmbigSpec *ambig_spec,
UNICHARSET *unicharset) {
343 ambig_spec->type =
static_cast<AmbigType>(type);
344 if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&
345 unicharset->
to_lower(test_unichar_ids[0]) ==
350 ambig_spec->wrong_ngram_size =
365 ambig_spec->correct_ngram_id =
367 if (replacement_ambig_part_size > 1) {
368 unicharset->
set_isngram(ambig_spec->correct_ngram_id,
true);
372 for (i = 0; i < test_ambig_part_size; ++i) {
374 if (test_ambig_part_size == 1) {
375 unichar_id = ambig_spec->correct_ngram_id;
378 replacement_string, i, test_ambig_part_size,
false);
382 ambig_spec->correct_fragments[i] = unichar_id;
384 ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;
388 if (table[test_unichar_ids[0]] == NULL) {
389 table[test_unichar_ids[0]] =
new AmbigSpec_LIST();
391 if (table[test_unichar_ids[0]]->add_sorted(
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
bool TESS_API contains_unichar(const char *const unichar_repr) const
void insert(T t, int index)
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
bool Open(const STRING &filename, FileReader reader)
const int ksizeofUniversalAmbigsFile
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
GenericVector< UNICHAR_ID > UnicharIdVector
const char kUniversalAmbigsFile[]
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
char * FGets(char *buffer, int buffer_size)
const char * string() const
const int kMaxAmbigStringSize
UNICHAR_ID correct_ngram_id
STRING debug_str(UNICHAR_ID id) const
static int compare_ambig_specs(const void *spec1, const void *spec2)
void TESS_API unichar_insert(const char *const unichar_repr)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
void set_isngram(UNICHAR_ID unichar_id, bool value)
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
void chomp_string(char *str)
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const