// Definitions of TessLangModel's static data members.

// Edge budget. GetEdges assigns dawg_cnt * max_edge_ (no parent edge) or
// max_edge_ to *edge_cnt before fanning out -- presumably the size of the
// edge array it allocates; the allocation itself is not visible in this file
// view.
42 int TessLangModel::max_edge_ = 4096;
// Marker pointers produced by casting the DAWG_OOD / DAWG_NUMBER constants to
// Dawg*. FanOut compares its dawg argument against these same casts to route
// the request to OODEdges or NumberEdges instead of walking a dawg.
// NOTE(review): presumably these are never dereferenced -- confirm against
// the definitions of DAWG_OOD / DAWG_NUMBER, which are not visible here.
45 const Dawg *TessLangModel::ood_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_OOD);
46 const Dawg *TessLangModel::number_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_NUMBER);
// One repeat limit per number-state-machine state (kStateCnt entries).
// NumberEdges compares the repeat count of a transition that stays in the
// same state against num_max_repeat_[state].
55 const int TessLangModel::num_max_repeat_[
kStateCnt] = {3, 32, 8, 3};
// Fragment of the TessLangModel constructor: only part of the parameter list
// and two statements of the body are visible in this listing.
61 const string &data_file_path,
62 bool load_system_dawg,
// Parse the lm_params description into the punctuation / operator string
// members (see LoadLangModelElements).
68 LoadLangModelElements(lm_params);
// Work guarded by the caller's load_system_dawg request; the rest of the
// condition and the guarded statements are not visible here. The visible
// argument below passes the context's language string.
72 if (load_system_dawg &&
78 cntxt_->
Lang().c_str(),
// Deletes every non-NULL edge stored in edge_array[0 .. edge_cnt).
// A NULL edge_array is tolerated.
// NOTE(review): the tail of this function is missing from the listing, so
// whether the array itself is released as well cannot be confirmed here.
87 void TessLangModel::FreeEdges(
int edge_cnt,
LangModEdge **edge_array) {
88 if (edge_array != NULL) {
89 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
// NULL slots are skipped. IsValidSequence relies on this: it NULLs the
// slot of an edge it returns to its caller before calling FreeEdges.
90 if (edge_array[edge_idx] != NULL) {
91 delete edge_array[edge_idx];
// Fragment of IsValidSequence (the start of the signature is missing from
// this listing). The visible code expands the edges reachable from `edge`,
// looks for one whose first character equals sequence[0], and, when that is
// the last character of the sequence, optionally returns the matching edge
// through final_edge.
105 LangModEdge **final_edge) {
// Candidate edges following `edge`; no CharAltList is supplied (NULL).
108 LangModEdge **edge_array =
GetEdges(NULL, edge, &edge_cnt);
111 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
// Only the first character of the edge string is compared.
113 if (sequence[0] == edge_array[edge_idx]->EdgeString()[0]) {
// sequence[1] == 0: sequence[0] is the final character of the sequence.
115 if (sequence[1] == 0) {
// Match if end-of-word is not required, or this edge is an end-of-word.
117 if (eow_flag ==
false || edge_array[edge_idx]->IsEOW()) {
118 if (final_edge != NULL) {
// Hand the matching edge to the caller and clear its slot so that the
// FreeEdges call below skips it and the edge stays alive.
119 (*final_edge) = edge_array[edge_idx];
120 edge_array[edge_idx] = NULL;
123 FreeEdges(edge_cnt, edge_array);
// The lines below are tails of other exit paths (the start of the call that
// ends in `final_edge) == true` is missing -- presumably a recursive call on
// the rest of the sequence; TODO confirm). Each visible path calls FreeEdges
// on edge_array.
129 final_edge) ==
true) {
130 FreeEdges(edge_cnt, edge_array);
137 FreeEdges(edge_cnt, edge_array);
// NOTE(review): these two lines appear to belong to a separate overload whose
// header is not visible; they reset the caller's out-pointer to NULL.
147 if (final_edge != NULL) {
148 (*final_edge) = NULL;
// Three one-line membership tests. Each is the visible body of a separate
// predicate whose signature is missing from this listing (presumably
// IsLeadingPunc, IsTrailingPunc and a digit test, judging by the members
// used -- TODO confirm). Each returns true when ch occurs in the named
// string member.
155 return lead_punc_.find(ch) != string::npos;
159 return trail_punc_.find(ch) != string::npos;
163 return digits_.find(ch) != string::npos;
// Fragment of GetEdges (header and several statements are missing from this
// listing). Two paths are visible: fan-out from the roots when there is no
// parent edge, and fan-out from an existing edge otherwise.

// No parent edge: collect root edges from every source.
179 if (tess_lm_edge == NULL) {
181 int dawg_cnt = NumDawgs();
// Upper bound of max_edge_ edges per dawg -- presumably used to size
// edge_array on a line that is not visible here.
183 (*edge_cnt) = dawg_cnt * max_edge_;
// Note the chained assignment: *edge_cnt is reset to 0 here and from now on
// counts the edges actually written into edge_array.
186 for (
int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
187 const Dawg *curr_dawg = GetDawg(dawg_idx);
// Root fan-out (edge_ref 0, NULL string, root_flag true) of this dawg,
// appended after the edges collected so far. Any condition guarding this
// call is on lines not visible here.
191 (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL,
true,
192 edge_array + (*edge_cnt));
// Same root fan-out for the number marker dawg ...
196 (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL,
true,
197 edge_array + (*edge_cnt));
// ... and for the OOD marker dawg.
201 (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL,
true,
202 edge_array + (*edge_cnt));
// Flag every collected edge as a root edge.
205 for (
int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
206 edge_array[edge_idx]->
SetRoot(
true);
// Path with a parent edge: budget of max_edge_, then *edge_cnt is replaced by
// the number of edges FanOut produces from tess_lm_edge (root_flag false).
210 (*edge_cnt) = max_edge_;
215 (*edge_cnt) = FanOut(alt_list,
218 tess_lm_edge->
EdgeString(),
false, edge_array);
// Fragment of Edges (rest of the parameter list and most of the body are
// missing from this listing). The visible code walks the NUL-terminated
// character list strng, acts only on characters whose class id is valid, and
// applies edge_mask via SetEdgeMask -- presumably to an edge created for that
// character on a line not shown.
225 int TessLangModel::Edges(
const char *strng,
const Dawg *dawg,
231 for (edge_idx = 0; strng[edge_idx] != 0; edge_idx++) {
// Characters without a valid class id do not reach the code below.
233 if (class_id != INVALID_UNICHAR_ID) {
239 SetEdgeMask(edge_mask);
// Fragment of OODEdges: iterates over all class ids and creates a
// TessLangModEdge for each class that passes the cost filter below.
// Lines between the visible ones (including any further filter and the
// return) are missing from this listing.
248 int TessLangModel::OODEdges(CharAltList *alt_list,
EDGE_REF edge_ref,
249 EDGE_REF edge_ref_mask, LangModEdge **edge_array) {
252 for (
int class_id = 0; class_id < class_cnt; class_id++) {
// With no alt_list every class passes; otherwise the class cost must not
// exceed max_ood_shape_cost_.
254 if ((alt_list == NULL ||
255 alt_list->ClassCost(class_id) <= max_ood_shape_cost_)) {
// Edge constructed from the context and class id only (no dawg).
257 edge_array[edge_cnt] =
new TessLangModEdge(cntxt_, class_id);
// Fragment of FanOut: writes the edges that can follow edge_ref in `dawg`
// into edge_array. Many lines are missing from this listing (including part
// of the parameter list and the final return), so only the visible steps are
// annotated.
266 int TessLangModel::FanOut(CharAltList *alt_list,
const Dawg *dawg,
268 const char_32 *str,
bool root_flag,
269 LangModEdge **edge_array) {
// Marker dawgs are not walked: the OOD marker delegates to OODEdges ...
274 if (dawg == reinterpret_cast<Dawg *>(
DAWG_OOD)) {
276 return OODEdges(alt_list, edge_ref, edge_mask, edge_array);
280 }
else if (dawg == reinterpret_cast<Dawg *>(
DAWG_NUMBER)) {
// ... and the number marker delegates to NumberEdges.
283 return NumberEdges(edge_ref, edge_array);
// A further branch (its condition is not visible) returns edges built from
// the trailing-punctuation characters.
291 return Edges(trail_punc_.c_str(), dawg, edge_ref,
296 }
else if (root_flag ==
true || edge_ref == 0) {
// Does edge_ref end a word in this dawg?
307 bool eow_flag = (dawg->end_of_word(edge_ref) != 0);
310 if (eow_flag ==
true) {
// At end of word: add trailing-punctuation edges ...
313 edge_cnt += Edges(trail_punc_.c_str(), dawg, edge_ref,
// ... and '-' / '/' edges created with edge_ref 0 and the following
// argument 0. NOTE(review): any extra condition guarding this call is not
// visible here.
316 edge_cnt += Edges(
"-/", dawg, 0, 0, edge_array + edge_cnt);
321 next_node = dawg->next_node(edge_ref);
// No successor node; the handling of this case is not visible here.
322 if (next_node == 0 || next_node == NO_EDGE) {
332 edge_array + edge_cnt);
// Remember where the child edges written by the call above begin.
333 int strt_cnt = edge_cnt;
// Tag each child edge with edge_mask, advancing edge_cnt past it.
336 for (
int child = 0; child < child_edge_cnt; child++) {
337 reinterpret_cast<TessLangModEdge *
>(edge_array[edge_cnt++])->
338 SetEdgeMask(edge_mask);
// Root fan-out only: revisit the children just added.
342 if (root_flag ==
true) {
343 for (
int child = 0; child < child_edge_cnt; child++) {
344 TessLangModEdge *child_edge =
345 reinterpret_cast<TessLangModEdge *
>(edge_array[strt_cnt + child]);
// Only when has_case_ is set.
347 if (has_case_ ==
true) {
348 const char_32 *edge_str = child_edge->EdgeString();
// Child string starts with a lowercase character; the remainder of the
// condition and the computation of class_id are missing from this listing
// (presumably the class of the uppercase form -- TODO confirm).
349 if (edge_str != NULL && islower(edge_str[0]) != 0 &&
353 if (class_id != INVALID_UNICHAR_ID) {
// Extra edge with the same start/end edge as the child but the
// alternative class_id, tagged with the same edge_mask.
355 edge_array[edge_cnt] =
new TessLangModEdge(cntxt_, dawg,
356 child_edge->StartEdge(), child_edge->EndEdge(), class_id);
358 reinterpret_cast<TessLangModEdge *
>(edge_array[edge_cnt])->
359 SetEdgeMask(edge_mask);
// Fragment of NumberEdges: generates the edges allowed after edge_ref by the
// number state machine. The decoding of edge_ref into state / repeat count,
// the loop over literal classes, and the return are missing from this listing.
371 int TessLangModel::NumberEdges(
EDGE_REF edge_ref, LangModEdge **edge_array) {
// Transition for literal class `lit` from the current state.
392 new_state = num_state_machine_[state][lit];
// Staying in the same state counts as one more repetition.
397 if (new_state == state) {
398 new_repeat_cnt = repeat_cnt + 1;
// Repeat limit for this state exceeded; the handling is not visible here
// (presumably this literal class is skipped -- TODO confirm).
404 if (new_repeat_cnt > num_max_repeat_[state]) {
// One edge per character of the literal class string, attached to the number
// marker dawg and carrying new_edge_ref.
412 edge_cnt += Edges(literal_str_[lit]->c_str(), number_dawg_,
413 new_edge_ref, 0, edge_array + edge_cnt);
// Fragment of LoadLangModelElements: fills the punctuation / operator string
// members from lm_params. The visible code treats each entry as a two-token
// pair and dispatches on the first token; the splitting calls, the bodies of
// the "Digits" and "Alphas" branches, and the return are missing from this
// listing.
420 bool TessLangModel::LoadLangModelElements(
const string &lm_params) {
423 vector<string> str_vec;
// NOTE(review): `entry` is int while size() is unsigned (signed/unsigned
// comparison).
425 for (
int entry = 0; entry < str_vec.size(); entry++) {
426 vector<string> tokens;
// Entries that do not have exactly two tokens are handled on a line not
// visible here.
429 if (tokens.size() != 2)
// tokens[0] selects the member, tokens[1] is the value stored in it.
431 if (tokens[0] ==
"LeadPunc") {
432 lead_punc_ = tokens[1];
433 }
else if (tokens[0] ==
"TrailPunc") {
434 trail_punc_ = tokens[1];
435 }
else if (tokens[0] ==
"NumLeadPunc") {
436 num_lead_punc_ = tokens[1];
437 }
else if (tokens[0] ==
"NumTrailPunc") {
438 num_trail_punc_ = tokens[1];
439 }
else if (tokens[0] ==
"Operators") {
440 operators_ = tokens[1];
441 }
else if (tokens[0] ==
"Digits") {
443 }
else if (tokens[0] ==
"Alphas") {
// Table of literal classes, indexed by `lit` in NumberEdges:
// 0 = number leading punctuation, 1 = number trailing punctuation,
// 2 = digits, 3 = operators, 4 = alphas.
458 literal_str_[0] = &num_lead_punc_;
459 literal_str_[1] = &num_trail_punc_;
460 literal_str_[2] = &digits_;
461 literal_str_[3] = &operators_;
462 literal_str_[4] = &alphas_;
// Fragment of a string-cleaning routine whose header is missing from this
// listing (presumably RemoveInvalidCharacters -- TODO confirm). It copies
// into clean_str32 only the characters of lm_str32 that have a valid class
// id.
475 for (
int i = 0; i < len; ++i) {
477 if (class_id != INVALID_UNICHAR_ID) {
// Keep this character; the increment of clean_len is on a line not visible
// here.
478 clean_str32[clean_len] = lm_str32[i];
// NUL-terminate the filtered copy.
482 clean_str32[clean_len] = 0;
// At least one character was dropped; what is done with the filtered copy is
// not visible here.
483 if (clean_len < len) {
// Release the temporary buffer.
487 delete [] clean_str32;
// Fragment of NumDawgs: the result depends on whether word_dawgs_ is set.
// Both operands of the conditional are missing from this listing.
490 int TessLangModel::NumDawgs()
const {
491 return (word_dawgs_ != NULL) ?
// Fragment of GetDawg: returns the dawg at position `index`.
497 const Dawg *TessLangModel::GetDawg(
int index)
const {
// When this model has its own word_dawgs_ vector, index into it directly.
498 if (word_dawgs_ != NULL) {
500 return (*word_dawgs_)[index];
// Otherwise index must be below the dawg count of the Tesseract object's
// dictionary; the lookup that follows the assertion is not visible here.
502 ASSERT_HOST(index < cntxt_->TesseractObject()->getDict().NumDawgs());
FILE * GetDataFilePtr() const
bool IsTrailingPunc(char_32 ch)
#define NUMBER_REPEAT_MASK
#define NUMBER_REPEAT_SHIFT
#define TrailingPuncEdgeMask(Cnt)
#define NUMBER_STATE_MASK
#define TrailingPuncCount(edge_mask)
#define IsTrailingPuncEdge(edge_mask)
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
#define LEAD_PUNC_EDGE_REF_MASK
void RemoveInvalidCharacters(string *lm_str)
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
static int Prob2Cost(double prob_val)
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
virtual void SetRoot(bool flag)=0
tesseract::Tesseract * TesseractObject() const
static int StrLen(const char_32 *str)
const Dawg * GetDawg() const
EDGE_REF EdgeMask() const
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
const string & Lang() const
bool IsLeadingPunc(char_32 ch)
TessLangModel(const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
CharSet * CharacterSet() const
const char_32 * EdgeString() const
GenericVector< Dawg * > DawgVector
basic_string< char_32 > string_32
#define NUMBER_LITERAL_SHIFT
bool SeekToStart(TessdataType tessdata_type)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
int ClassID(const char_32 *str) const
#define NUMBER_STATE_SHIFT