tesseract
3.05.02
word_unigrams.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: word_unigrams.h
3
* Description: Declaration of the Word Unigrams Class
4
* Author: Ahmad Abdulkader
5
* Created: 2008
6
*
7
* (C) Copyright 2008, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
// The WordUnigrams class holds the unigrams of the most frequent set of words
21
// in a language. It is an optional component of the Cube OCR engine. If
22
// present, the unigram cost of a word is aggregated with the other costs
23
// (Recognition, Language Model, Size) to compute a cost for a word.
24
// The word list is assumed to be sorted in lexicographic order.
25
26
#ifndef WORD_UNIGRAMS_H
27
#define WORD_UNIGRAMS_H
28
29
#include <string>
30
#include "
char_set.h
"
31
#include "
lang_model.h
"
32
33
namespace
tesseract
{
34
class
WordUnigrams
{
35
public
:
36
WordUnigrams
();
37
~WordUnigrams
();
38
// Load the word-list and unigrams from file and create an object
39
// The word list is assumed to be sorted
40
static
WordUnigrams
*
Create
(
const
string
&data_file_path,
41
const
string
&
lang
);
42
// Compute the unigram cost of a UTF-32 string. Splits into
43
// space-separated tokens, strips trailing punctuation from each
44
// token, evaluates case properties, and calls internal Cost()
45
// function on UTF-8 version. To avoid unnecessarily penalizing
46
// all-one-case words or capitalized words (first-letter
47
// upper-case and remaining letters lower-case) when not all
48
// versions of the word appear in the <lang>.cube.word-freq file, a
49
// case-invariant cost is computed in those cases, assuming the word
50
// meets a minimum length.
51
int
Cost
(
const
char_32
*str32,
LangModel
*lang_mod,
52
CharSet
*char_set)
const
;
53
protected
:
54
// Compute the word unigram cost of a UTF-8 string with binary
55
// search of sorted words_ array.
56
int
CostInternal
(
const
char
*str)
const
;
57
private
:
58
// Only words this length or greater qualify for all-numeric or
59
// case-invariant word unigram cost.
60
static
const
int
kMinLengthNumOrCaseInvariant = 4;
61
62
int
word_cnt_;
63
char
**words_;
64
int
*costs_;
65
int
not_in_list_cost_;
66
};
67
}
68
69
#endif // WORD_UNIGRAMS_H
tesseract::CharSet
Definition:
char_set.h:42
tesseract::WordUnigrams::~WordUnigrams
~WordUnigrams()
Definition:
word_unigrams.cpp:38
lang_model.h
tesseract::WordUnigrams::CostInternal
int CostInternal(const char *str) const
Definition:
word_unigrams.cpp:230
tesseract-c_api-demo.lang
string lang
Definition:
tesseract-c_api-demo.py:28
tesseract::WordUnigrams::Create
static WordUnigrams * Create(const string &data_file_path, const string &lang)
Definition:
word_unigrams.cpp:57
char_set.h
tesseract::WordUnigrams
Definition:
word_unigrams.h:34
tesseract
Definition:
baseapi.cpp:81
tesseract::WordUnigrams::WordUnigrams
WordUnigrams()
Definition:
word_unigrams.cpp:32
tesseract::char_32
signed int char_32
Definition:
string_32.h:40
tesseract::LangModel
Definition:
lang_model.h:34
tesseract::WordUnigrams::Cost
int Cost(const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const
Definition:
word_unigrams.cpp:135
cube
word_unigrams.h
Generated on Mon Oct 29 2018 11:27:49 for tesseract by
1.8.14