#include <word_unigrams.h>
Definition at line 34 of file word_unigrams.h.
◆ WordUnigrams()
tesseract::WordUnigrams::WordUnigrams |
( |
| ) |
|
◆ ~WordUnigrams()
tesseract::WordUnigrams::~WordUnigrams |
( |
| ) |
|
◆ Cost()
Split the input into space-separated tokens, strip trailing punctuation from each, determine case properties, call the UTF-8 flavor of the cost function on each word, and aggregate all into a single mean word cost.
Definition at line 135 of file word_unigrams.cpp.
143 vector<string> words;
153 for (
int word_idx = 0; word_idx < words.size(); word_idx++) {
163 while (clean_len > 0 &&
164 lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
174 if (clean_len == 0 || !trunc) {
177 clean_str32 =
new char_32[clean_len + 1];
178 for (
int i = 0; i < clean_len; ++i) {
179 clean_str32[i] = str32[i];
181 clean_str32[clean_len] =
'\0';
191 if (clean_len >= kMinLengthNumOrCaseInvariant &&
209 if (clean_len >= kMinLengthNumOrCaseInvariant) {
211 bool is_numeric =
true;
212 for (
int i = 0; i < clean_len; ++i) {
213 if (!lang_mod->IsDigit(clean_str32[i]))
219 delete [] clean_str32;
224 return static_cast<int>(cost /
static_cast<double>(words.size()));
int CostInternal(const char *str) const
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
static int StrLen(const char_32 *str)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
basic_string< char_32 > string_32
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
static char_32 * StrDup(const char_32 *str)
◆ CostInternal()
int tesseract::WordUnigrams::CostInternal |
( |
const char * |
key_str | ) |
const |
|
protected |
Search for a UTF-8 string using binary search of the sorted words_ array.
Definition at line 230 of file word_unigrams.cpp.
231 if (strlen(key_str) == 0)
232 return not_in_list_cost_;
233 int hi = word_cnt_ - 1;
236 int current = (hi + lo) / 2;
237 int comp = strcmp(key_str, words_[current]);
240 return costs_[current];
250 return not_in_list_cost_;
◆ Create()
WordUnigrams * tesseract::WordUnigrams::Create |
( |
const string & |
data_file_path, |
|
|
const string & |
lang |
|
) |
| |
|
static |
Load the word-list and unigrams from file and create an object. The word list is assumed to be sorted in lexicographic order.
Definition at line 57 of file word_unigrams.cpp.
62 file_name = data_file_path +
lang;
63 file_name +=
".cube.word-freq";
71 vector<string> str_vec;
73 if (str_vec.size() < 2) {
80 int full_len = str.length();
81 int word_cnt = str_vec.size() / 2;
82 word_unigrams_obj->words_ =
new char*[word_cnt];
83 word_unigrams_obj->costs_ =
new int[word_cnt];
85 word_unigrams_obj->words_[0] =
new char[full_len];
88 word_unigrams_obj->word_cnt_ = 0;
89 char *char_buff = word_unigrams_obj->words_[0];
93 for (
int wrd = 0; wrd < str_vec.size(); wrd += 2) {
94 word_unigrams_obj->words_[word_cnt] = char_buff;
96 strcpy(char_buff, str_vec[wrd].c_str());
97 char_buff += (str_vec[wrd].length() + 1);
99 if (sscanf(str_vec[wrd + 1].c_str(),
"%d",
100 word_unigrams_obj->costs_ + word_cnt) != 1) {
101 fprintf(stderr,
"Cube ERROR (WordUnigrams::Create): error reading "
102 "word unigram data.\n");
103 delete word_unigrams_obj;
107 max_cost =
MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
110 word_unigrams_obj->word_cnt_ = word_cnt;
123 word_unigrams_obj->not_in_list_cost_ = max_cost +
126 return word_unigrams_obj;
static bool ReadFileToString(const string &file_name, string *str)
static int Prob2Cost(double prob_val)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
The documentation for this class was generated from the following files: