tesseract  3.05.02
tess_lang_model.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tess_lang_model.h
3  * Description: Declaration of the Tesseract Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESS_LANG_MODEL_H
21 #define TESS_LANG_MODEL_H
22 
23 #include <string>
24 
25 #include "char_altlist.h"
26 #include "cube_reco_context.h"
27 #include "cube_tuning_params.h"
28 #include "dict.h"
29 #include "lang_model.h"
30 #include "tessdatamanager.h"
31 #include "tess_lang_mod_edge.h"
32 
33 namespace tesseract {
34 
35 const int kStateCnt = 4;
36 const int kNumLiteralCnt = 5;
37 
38 class TessLangModel : public LangModel {
39  public:
40  TessLangModel(const string &lm_params,
41  const string &data_file_path,
42  bool load_system_dawg,
43  TessdataManager *tessdata_manager,
44  CubeRecoContext *cntxt);
46  if (word_dawgs_ != NULL) {
47  word_dawgs_->delete_data_pointers();
48  delete word_dawgs_;
49  }
50  }
51 
52  // returns a pointer to the root of the language model
53  inline TessLangModEdge *Root() {
54  return NULL;
55  }
56 
57  // The general fan-out generation function. Returns the list of edges
58  // fanning-out of the specified edge and their count. If an AltList is
59  // specified, only the class-ids with a minimum cost are considered
60  LangModEdge **GetEdges(CharAltList *alt_list,
61  LangModEdge *edge,
62  int *edge_cnt);
63  // Determines if a sequence of 32-bit chars is valid in this language model
64  // starting from the root. If the eow_flag is ON, also checks for
65  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
66  // edge
67  bool IsValidSequence(const char_32 *sequence, bool eow_flag,
68  LangModEdge **final_edge = NULL);
69  bool IsLeadingPunc(char_32 ch);
70  bool IsTrailingPunc(char_32 ch);
71  bool IsDigit(char_32 ch);
72 
73  void RemoveInvalidCharacters(string *lm_str);
74  private:
75  // static LM state machines
76  static const Dawg *ood_dawg_;
77  static const Dawg *number_dawg_;
78  static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
79  static const int num_max_repeat_[kStateCnt];
80  // word_dawgs_ should only be loaded if cube has its own version of the
81  // unicharset (different from the one used by tesseract) and therefore
82  // can not use the dawgs loaded for tesseract (since the unichar ids
83  // encoded in the dawgs differ).
84  DawgVector *word_dawgs_;
85 
86  static int max_edge_;
87  static int max_ood_shape_cost_;
88 
89  // remaining language model elements needed by cube. These get loaded from
90  // the .lm file
91  string lead_punc_;
92  string trail_punc_;
93  string num_lead_punc_;
94  string num_trail_punc_;
95  string operators_;
96  string digits_;
97  string alphas_;
98  // String of characters in RHS of each line of <lang>.cube.lm
99  // Each element is hard-coded to correspond to a specific token type
100  // (see LoadLangModelElements)
101  string *literal_str_[kNumLiteralCnt];
102  // Recognition context needed to access language properties
103  // (case, cursive,..)
104  CubeRecoContext *cntxt_;
105  bool has_case_;
106 
107  // computes and returns the edges that fan out of an edge ref
108  int FanOut(CharAltList *alt_list,
109  const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
110  const char_32 *str, bool root_flag, LangModEdge **edge_array);
111  // generate edges from an NULL terminated string
112  // (used for punctuation, operators and digits)
113  int Edges(const char *strng, const Dawg *dawg,
114  EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
115  LangModEdge **edge_array);
116  // Generate the edges fanning-out from an edge in the number state machine
117  int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);
118  // Generate OOD edges
119  int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
120  EDGE_REF edge_ref_mask, LangModEdge **edge_array);
121  // Cleanup an edge array
122  void FreeEdges(int edge_cnt, LangModEdge **edge_array);
123  // Determines if a sequence of 32-bit chars is valid in this language model
124  // starting from the specified edge. If the eow_flag is ON, also checks for
125  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
126  // edge
127  bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
128  bool eow_flag, LangModEdge **final_edge);
129  // Parse language model elements from the given string, which should
130  // have been loaded from <lang>.cube.lm file, e.g. in CubeRecoContext
131  bool LoadLangModelElements(const string &lm_params);
132 
133  // Returns the number of word Dawgs in the language model.
134  int NumDawgs() const;
135 
136  // Returns the dawgs with the given index from either the dawgs
137  // stored by the Tesseract object, or the word_dawgs_.
138  const Dawg *GetDawg(int index) const;
139 };
140 } // tesseract
141 
142 #endif // TESS_LANG_MODEL_H
bool IsTrailingPunc(char_32 ch)
inT64 EDGE_REF
Definition: dawg.h:54
void RemoveInvalidCharacters(string *lm_str)
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
bool IsLeadingPunc(char_32 ch)
TessLangModel(const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
void delete_data_pointers()
const int kNumLiteralCnt
signed int char_32
Definition: string_32.h:40
TessLangModEdge * Root()
const int kStateCnt