tesseract  3.05.02
word_unigrams.h
Go to the documentation of this file.
1  /**********************************************************************
2  * File: word_unigrams.h
3  * Description: Declaration of the Word Unigrams Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The WordUnigrams class holds the unigrams of the most frequent set of words
21 // in a language. It is an optional component of the Cube OCR engine. If
22 // present, the unigram cost of a word is aggregated with the other costs
23 // (Recognition, Language Model, Size) to compute a cost for a word.
24 // The word list is assumed to be sorted in lexicographic order.
25 
26 #ifndef WORD_UNIGRAMS_H
27 #define WORD_UNIGRAMS_H
28 
29 #include <string>
30 #include "char_set.h"
31 #include "lang_model.h"
32 
33 namespace tesseract {
34 class WordUnigrams {
35  public:
36  WordUnigrams();
37  ~WordUnigrams();
38  // Load the word-list and unigrams from file and create an object.
39  // The word list is assumed to be sorted in lexicographic order.
40  static WordUnigrams *Create(const string &data_file_path,
41  const string &lang);  // NOTE(review): ownership/failure value of the result not shown here -- confirm
42  // Compute the unigram cost of a UTF-32 string. Splits into
43  // space-separated tokens, strips trailing punctuation from each
44  // token, evaluates case properties, and calls the internal cost
45  // function (CostInternal()) on the UTF-8 version. To avoid unnecessarily
46  // penalizing all-one-case words or capitalized words (first-letter
47  // upper-case and remaining letters lower-case) when not all
48  // versions of the word appear in the <lang>.cube.word-freq file, a
49  // case-invariant cost is computed in those cases, assuming the word
50  // meets a minimum length (see kMinLengthNumOrCaseInvariant).
51  int Cost(const char_32 *str32, LangModel *lang_mod,
52  CharSet *char_set) const;
53  protected:
54  // Compute the word unigram cost of a UTF-8 string with a binary
55  // search of the sorted words_ array.
56  int CostInternal(const char *str) const;
57  private:
58  // Only words of this length or greater qualify for the all-numeric or
59  // case-invariant word unigram cost.
60  static const int kMinLengthNumOrCaseInvariant = 4;
61 
62  int word_cnt_;  // presumably the number of entries in words_ and costs_ -- TODO confirm
63  char **words_;  // sorted word array searched by CostInternal()
64  int *costs_;  // presumably the cost for each entry of words_ -- TODO confirm
65  int not_in_list_cost_;  // presumably the cost used for words not found in words_ -- TODO confirm
66 };
67 }
68 
69 #endif // WORD_UNIGRAMS_H
int CostInternal(const char *str) const
static WordUnigrams * Create(const string &data_file_path, const string &lang)
signed int char_32
Definition: string_32.h:40
int Cost(const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const