38 cube_cntxt_ = cube_cntxt;
43 if (combiner_net_ != NULL) {
54 string net_file_name = data_path + cube_cntxt_->
Lang() +
58 FILE *fp = fopen(net_file_name.c_str(),
"rb");
66 if (combiner_net_ == NULL) {
67 tprintf(
"Could not read combiner net file %s", net_file_name.c_str());
69 }
else if (combiner_net_->
out_cnt() != 2) {
70 tprintf(
"Invalid combiner net file %s! Output count != 2\n",
71 net_file_name.c_str());
81 string TesseractCubeCombiner::NormalizeString(
const string &str,
89 for (
int idx = 0; idx < str32.length(); idx++) {
91 if (!remove_punc || iswpunct(str32[idx]) == 0) {
94 if (norm_case && iswalpha(norm_char)) {
95 norm_char = towlower(norm_char);
97 new_str32.push_back(norm_char);
107 int TesseractCubeCombiner::CompareStrings(
const string &str1,
111 if (!ignore_punc && !ignore_case) {
112 return str1.compare(str2);
114 string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
115 string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
116 return norm_str1.compare(norm_str2);
132 vector<double> *features,
136 if (cube_alt_list == NULL || cube_alt_list->
AltCount() <= 0)
140 char_32 *cube_best_str32 = cube_alt_list->
Alt(0);
143 string cube_best_str;
144 int cube_best_cost = cube_alt_list->
AltCost(0);
145 int cube_best_bigram_cost = 0;
146 bool cube_best_bigram_cost_valid =
true;
148 cube_best_bigram_cost = cube_cntxt_->
Bigrams()->
151 cube_best_bigram_cost_valid =
false;
159 *agreement = (tess_str.compare(cube_best_str) == 0);
163 string cube_next_best_str;
165 if (cube_alt_list->
AltCount() > 1) {
166 cube_next_best_str32 = cube_alt_list->
Alt(1);
167 if (cube_next_best_str32 == NULL ||
171 cube_next_best_cost = cube_alt_list->
AltCost(1);
176 for (tess_rank = 0; tess_rank < cube_alt_list->
AltCount(); tess_rank++) {
179 if (alt_str == tess_str)
185 int tess_cost = cube_obj->
WordCost(tess_str.c_str());
187 int tess_bigram_cost = 0;
188 int tess_bigram_cost_valid =
true;
190 tess_bigram_cost = cube_cntxt_->
Bigrams()->
193 tess_bigram_cost_valid =
false;
196 features->push_back(tess_confidence);
198 features->push_back(tess_cost);
200 features->push_back(tess_rank);
202 features->push_back(tess_str.length());
204 features->push_back(
ValidWord(tess_str));
205 if (tess_bigram_cost_valid) {
207 features->push_back(tess_bigram_cost);
210 features->push_back(cube_best_cost);
212 features->push_back(cube_next_best_cost);
214 features->push_back(cube_best_str.length());
216 features->push_back(
ValidWord(cube_best_str));
217 if (cube_best_bigram_cost_valid) {
219 features->push_back(cube_best_bigram_cost);
222 int compare_nocase_punc = CompareStrings(cube_best_str,
223 tess_str,
false,
true);
224 features->push_back(compare_nocase_punc == 0);
226 int compare_case_nopunc = CompareStrings(cube_best_str,
227 tess_str,
true,
false);
228 features->push_back(compare_case_nopunc == 0);
230 int compare_nocase_nopunc = CompareStrings(cube_best_str,
231 tess_str,
true,
true);
232 features->push_back(compare_nocase_nopunc == 0);
244 if (combiner_net_ == NULL || cube_obj == NULL) {
245 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): " 246 "Cube objects not initialized; defaulting to Tesseract\n");
253 if (cube_alt_list == NULL)
255 if (cube_alt_list == NULL || cube_alt_list->
AltCount() <= 0) {
256 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): " 257 "Cube returned no results; defaulting to Tesseract\n");
273 if (combiner_net_ == NULL || cube_obj == NULL ||
274 cube_alt_list == NULL || cube_alt_list->
AltCount() <= 0) {
275 tprintf(
"Cube WARNING (TesseractCubeCombiner::CombineResults): " 276 "Cube result cannot be retrieved; defaulting to Tesseract\n");
284 int tess_confidence =
MIN(100,
MAX(1, static_cast<int>(
289 vector<double> features;
292 cube_obj, cube_alt_list,
293 &features, &agreement);
294 if (!combiner_success || agreement)
300 if (!combiner_net_->
FeedForward(&features[0], net_out))
static NeuralNet * FromFile(const string file_name)
const STRING & unichar_string() const
bool FeedForward(const Type *inputs, Type *outputs)
TesseractCubeCombiner(CubeRecoContext *cube_cntxt)
int WordCost(const char *str)
char_32 * Alt(int alt_idx)
WERD_CHOICE * best_choice
const char * string() const
virtual ~TesseractCubeCombiner()
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
tesseract::Tesseract * TesseractObject() const
static int StrLen(const char_32 *str)
WordAltList * RecognizeWord(LangModel *lang_mod=NULL)
const string & Lang() const
CharSet * CharacterSet() const
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
basic_string< char_32 > string_32
bool GetDataFilePath(string *path) const
int AltCost(int alt_idx) const
bool ValidWord(const std::string &str)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
bool ComputeCombinerFeatures(const std::string &tess_res, int tess_confidence, CubeObject *cube_obj, WordAltList *cube_alt_list, std::vector< double > *features, bool *agreement)
CharBigrams * Bigrams() const
WordAltList * AlternateList() const