tesseract  3.05.02
char_bigrams.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: char_bigrams.h
3  * Description: Declaration of a Character Bigrams Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The CharBigrams class represents the interface to the character bigram
21 // table used by Cube.
22 // A CharBigrams object can be constructed from the Char Bigrams file.
23 // Given a sequence of characters, the "Cost" method returns the Char Bigram
24 // cost of the string according to the table.
25 
26 #ifndef CHAR_BIGRAMS_H
27 #define CHAR_BIGRAMS_H
28 
29 #include <string>
30 #include "char_set.h"
31 
32 namespace tesseract {
33 
34 // Structure representing a single bigram value: a count and a cost.
35 struct Bigram {
36  int cnt;  // presumably the number of times this bigram was observed -- TODO confirm
37  int cost;  // presumably the cost assigned to this bigram (see CharBigrams::PairCost) -- TODO confirm
38 };
39 
40 // Structure representing the char bigram array of characters
41 // following a specific character.
42 struct CharBigram {
43  int total_cnt;  // presumably the total count over all following characters -- TODO confirm
46 };  // NOTE(review): listing lines 44-45 are missing from this copy; verify the remaining members against the upstream header
47 
48 // Structure representing the whole bigram table.
49 struct CharBigramTable {  // opening line restored (the listing skipped line 49); name matches the type of CharBigrams::bigram_table_
50  int total_cnt;  // presumably the total count over the whole table -- TODO confirm
54 };  // NOTE(review): listing lines 51-53 are also missing from this copy; restore those members from the upstream header
55 
56 class CharBigrams {  // Interface to the character bigram table; Cost() is the public entry point
57  public:
58  CharBigrams();
59  ~CharBigrams();
60  // Constructs a CharBigrams object from a file. NOTE(review): how data_file_path and lang select the file, who owns the returned pointer, and what is returned on failure are not visible in this header -- confirm in the implementation.
61  static CharBigrams *Create(const string &data_file_path,
62  const string &lang);  // NOTE(review): unqualified `string` presumably resolves to std::string through a declaration pulled in by an include -- confirm
63  // Top-level function to return the mean character bigram cost of a
64  // sequence of characters. If char_set is not NULL, use
65  // tesseract functions to return a case-invariant cost.
66  // This avoids unnecessarily penalizing all-one-case words or
67  // capitalized words (first-letter upper-case and remaining letters
68  // lower-case). See kMinLengthCaseInvariant for the length requirement.
69  int Cost(const char_32 *str, CharSet *char_set) const;
70 
71  protected:
72  // Returns the character bigram cost of two characters.
73  int PairCost(char_32 ch1, char_32 ch2) const;
74  // Returns the mean character bigram cost of a sequence of
75  // characters. Adds a space at the beginning and end to account for
76  // cost of starting and ending characters.
77  int MeanCostWithSpaces(const char_32 *char_32_ptr) const;
78 
79  private:
80  // Only words this length or greater qualify for case-invariant character
81  // bigram cost.
82  static const int kMinLengthCaseInvariant = 4;
83 
84  // The bigram table held by this object.
85  CharBigramTable bigram_table_;
86 };
87 }
88 
89 #endif // CHAR_BIGRAMS_H
int MeanCostWithSpaces(const char_32 *char_32_ptr) const
static CharBigrams * Create(const string &data_file_path, const string &lang)
int Cost(const char_32 *str, CharSet *char_set) const
signed int char_32
Definition: string_32.h:40
int PairCost(char_32 ch1, char_32 ch2) const