tesseract: cube/word_unigrams.cpp Source File

Go to the documentation of this file.
 /**********************************************************************
  * File:        word_unigrams.cpp
  * Description: Implementation of the Word Unigrams Class
  * Author:    Ahmad Abdulkader
  * Created:   2008
  *
  * (C) Copyright 2008, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/
 
 #include <math.h>
 #include <string>
 #include <vector>
 #include <algorithm>
 
 #include "const.h"
 #include "cube_utils.h"
 #include "ndminx.h"
 #include "word_unigrams.h"
 
 namespace tesseract {
 
 WordUnigrams::WordUnigrams() {
   costs_ = NULL;
   words_ = NULL;
   word_cnt_ = 0;
 }
 
 WordUnigrams::~WordUnigrams() {
   if (words_ != NULL) {
     if (words_[0] != NULL) {
       delete []words_[0];
     }
 
     delete []words_;
     words_ = NULL;
   }
 
   if (costs_ != NULL) {
     delete []costs_;
   }
 }
 
 WordUnigrams *WordUnigrams::Create(const string &data_file_path,
                                    const string &lang) {
   string file_name;
   string str;
 
   file_name = data_file_path + lang;
   file_name += ".cube.word-freq";
 
   // load the string into memory
   if (CubeUtils::ReadFileToString(file_name, &str) == false) {
     return NULL;
   }
 
   // split into lines
   vector<string> str_vec;
   CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
   if (str_vec.size() < 2) {
     return NULL;
   }
 
   // allocate memory
   WordUnigrams *word_unigrams_obj = new WordUnigrams();
 
   int full_len = str.length();
   int word_cnt = str_vec.size() / 2;
   word_unigrams_obj->words_ = new char*[word_cnt];
   word_unigrams_obj->costs_ = new int[word_cnt];
 
   word_unigrams_obj->words_[0] = new char[full_len];
 
   // construct sorted list of words and costs
   word_unigrams_obj->word_cnt_ = 0;
   char *char_buff = word_unigrams_obj->words_[0];
   word_cnt = 0;
   int max_cost = 0;
 
   for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
     word_unigrams_obj->words_[word_cnt] = char_buff;
 
     strcpy(char_buff, str_vec[wrd].c_str());
     char_buff += (str_vec[wrd].length() + 1);
 
     if (sscanf(str_vec[wrd + 1].c_str(), "%d",
                word_unigrams_obj->costs_ + word_cnt) != 1) {
       fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
               "word unigram data.\n");
       delete word_unigrams_obj;
       return NULL;
     }
     // update max cost
     max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
     word_cnt++;
   }
   word_unigrams_obj->word_cnt_ = word_cnt;
 
   // compute the not-in-list-cost by assuming that a word not in the list
   // [ahmadab]: This can be computed as follows:
   // - Given that the distribution of words follow Zipf's law:
   //   (F = K / (rank ^ S)), where s is slightly > 1.0
   // - Number of words in the list is N
   // - The mean frequency of a word that did not appear in the list is the
   //   area under the rest of the Zipf's curve divided by 2 (the mean)
   // - The area would be the bound integral from N to infinity =
   //   (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
   // - Given that cost = -LOG(prob), the cost of an unlisted word would be
   //   = max_cost + 2*LOG(N)
   word_unigrams_obj->not_in_list_cost_ = max_cost +
       (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
   // success
   return word_unigrams_obj;
 }
 
 int WordUnigrams::Cost(const char_32 *key_str32,
                        LangModel *lang_mod,
                        CharSet *char_set) const {
   if (!key_str32)
     return 0;
   // convert string to UTF8 to split into space-separated words
   string key_str;
   CubeUtils::UTF32ToUTF8(key_str32, &key_str);
   vector<string> words;
   CubeUtils::SplitStringUsing(key_str, " \t", &words);
 
   // no words => no cost
   if (words.empty()) {
     return 0;
   }
 
   // aggregate the costs of all the words
   int cost = 0;
   for (int word_idx = 0; word_idx < words.size(); word_idx++) {
     // convert each word back to UTF32 for analyzing case and punctuation
     string_32 str32;
     CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
     int len = CubeUtils::StrLen(str32.c_str());
 
     // strip all trailing punctuation
     string clean_str;
     int clean_len = len;
     bool trunc = false;
     while (clean_len > 0 &&
            lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
       --clean_len;
       trunc = true;
     }
 
     // If either the original string was not truncated (no trailing
     // punctuation) or the entire string was removed (all characters
     // are trailing punctuation), evaluate original word as is;
     // otherwise, copy all but the trailing punctuation characters
     char_32 *clean_str32 = NULL;
     if (clean_len == 0 || !trunc) {
       clean_str32 = CubeUtils::StrDup(str32.c_str());
     } else {
       clean_str32 = new char_32[clean_len + 1];
       for (int i = 0; i < clean_len; ++i) {
         clean_str32[i] = str32[i];
       }
       clean_str32[clean_len] = '\0';
     }
     ASSERT_HOST(clean_str32 != NULL);
 
     string str8;
     CubeUtils::UTF32ToUTF8(clean_str32, &str8);
     int word_cost = CostInternal(str8.c_str());
 
     // if case invariant, get costs of all-upper-case and all-lower-case
     // versions and return the min cost
     if (clean_len >= kMinLengthNumOrCaseInvariant &&
         CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
       char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
       if (lower_32) {
         string lower_8;
         CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
         word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
         delete [] lower_32;
       }
       char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
       if (upper_32) {
         string upper_8;
         CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
         word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
         delete [] upper_32;
       }
     }
 
     if (clean_len >= kMinLengthNumOrCaseInvariant) {
       // if characters are all numeric, incur 0 word cost
       bool is_numeric = true;
       for (int i = 0; i < clean_len; ++i) {
         if (!lang_mod->IsDigit(clean_str32[i]))
           is_numeric = false;
       }
       if (is_numeric)
         word_cost = 0;
     }
     delete [] clean_str32;
     cost += word_cost;
   }  // word_idx
 
   // return the mean cost
   return static_cast<int>(cost / static_cast<double>(words.size()));
 }
 
 int WordUnigrams::CostInternal(const char *key_str) const {
   if (strlen(key_str) == 0)
     return not_in_list_cost_;
   int hi = word_cnt_ - 1;
   int lo = 0;
   while (lo <= hi) {
     int current = (hi + lo) / 2;
     int comp = strcmp(key_str, words_[current]);
     // a match
     if (comp == 0) {
       return costs_[current];
     }
     if (comp < 0) {
       // go lower
       hi = current - 1;
     } else {
       // go higher
       lo = current + 1;
     }
   }
   return not_in_list_cost_;
 }
 }  // namespace tesseract