tesseract  3.05.02
WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for WERD_CHOICE:
ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
int length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const UNICHAR_ID * unichar_ids () const
 
UNICHAR_ID unichar_id (int index) const
 
int state (int index) const
 
tesseract::ScriptPos BlobPosition (int index) const
 
float rating () const
 
float certainty () const
 
float certainty (int index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uinT8 permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (int index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (int index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, int index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uinT8 perm)
 
void set_length (int len)
 
void double_the_size ()
 Make more space in unichar_id_ and fragment_lengths_ arrays. More...
 
void init (int reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
 
void set_blob_choice (int index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (int index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (int index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (int *start_core, int *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (int start, int end) const
 
void string_and_lengths (STRING *word_str, STRING *word_lengths_str) const
 
const STRING debug_string () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
const STRING & unichar_string () const
 
const STRING & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word)
 
void SetScriptPositions (const tesseract::ScriptPos *positions, int length)
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
int TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uinT8 permuter)
 
static tesseract::ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 271 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset)
inline

Definition at line 276 of file ratngs.h.

277  : unicharset_(unicharset) { this->init(8); }
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
void init(int reserved)
Definition: ratngs.h:407

◆ WERD_CHOICE() [2/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset,
int  reserved 
)
inline

Definition at line 278 of file ratngs.h.

279  : unicharset_(unicharset) { this->init(reserved); }
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
void init(int reserved)
Definition: ratngs.h:407

◆ WERD_CHOICE() [3/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uinT8  src_permuter,
const UNICHARSET &  unicharset 
)
inline

Definition at line 280 of file ratngs.h.

286  : unicharset_(&unicharset) {
287  this->init(src_string, src_lengths, src_rating,
288  src_certainty, src_permuter);
289  }
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
void init(int reserved)
Definition: ratngs.h:407

◆ WERD_CHOICE() [4/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET &  unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not NULL.

Definition at line 198 of file ratngs.cpp.

200  : unicharset_(&unicharset){
201  GenericVector<UNICHAR_ID> encoding;
202  GenericVector<char> lengths;
203  if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
204  lengths.push_back('\0');
205  STRING src_lengths = &lengths[0];
206  this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
207  } else { // There must have been an invalid unichar in the string.
208  this->init(8);
209  this->make_bad();
210  }
211 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
int push_back(T object)
const char * string() const
Definition: strngs.cpp:201
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:441
void init(int reserved)
Definition: ratngs.h:407
Definition: strngs.h:44

◆ WERD_CHOICE() [5/5]

WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE &  word)
inline

Definition at line 291 of file ratngs.h.

292  : ELIST_LINK(word), unicharset_(word.unicharset_) {
293  this->init(word.length());
294  this->operator=(word);
295  }
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:499
ELIST_LINK()
Definition: elst.h:92
void init(int reserved)
Definition: ratngs.h:407
int length() const
Definition: ratngs.h:301

◆ ~WERD_CHOICE()

WERD_CHOICE::~WERD_CHOICE ( )

WERD_CHOICE::~WERD_CHOICE

Definition at line 254 of file ratngs.cpp.

254  {
255  delete[] unichar_ids_;
256  delete[] script_pos_;
257  delete[] state_;
258  delete[] certainties_;
259 }

Member Function Documentation

◆ adjust_factor()

float WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 304 of file ratngs.h.

304  {
305  return adjust_factor_;
306  }

◆ append_unichar_id()

void WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 446 of file ratngs.cpp.

448  {
449  if (length_ == reserved_) {
450  this->double_the_size();
451  }
452  this->append_unichar_id_space_allocated(unichar_id, blob_count,
453  rating, certainty);
454 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:385
float certainty() const
Definition: ratngs.h:328
float rating() const
Definition: ratngs.h:325
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450

◆ append_unichar_id_space_allocated()

void WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 450 of file ratngs.h.

452  {
453  assert(reserved_ > length_);
454  length_++;
455  this->set_unichar_id(unichar_id, blob_count,
456  rating, certainty, length_-1);
457  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
float certainty() const
Definition: ratngs.h:328
float rating() const
Definition: ratngs.h:325

◆ blob_choices()

BLOB_CHOICE_LIST * WERD_CHOICE::blob_choices ( int  index,
MATRIX *  ratings 
) const

Definition at line 268 of file ratngs.cpp.

268  {
269  MATRIX_COORD coord = MatrixCoord(index);
270  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
271  if (result == NULL) {
272  result = new BLOB_CHOICE_LIST;
273  ratings->put(coord.col, coord.row, result);
274  }
275  return result;
276 }
T get(ICOORD pos) const
Definition: matrix.h:228
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:280

◆ BlobPosition()

tesseract::ScriptPos WERD_CHOICE::BlobPosition ( int  index) const
inline

Definition at line 320 of file ratngs.h.

320  {
321  if (index < 0 || index >= length_)
322  return tesseract::SP_NORMAL;
323  return script_pos_[index];
324  }

◆ certainty() [1/2]

float WERD_CHOICE::certainty ( ) const
inline

Definition at line 328 of file ratngs.h.

328  {
329  return certainty_;
330  }

◆ certainty() [2/2]

float WERD_CHOICE::certainty ( int  index) const
inline

Definition at line 331 of file ratngs.h.

331  {
332  return certainties_[index];
333  }

◆ contains_unichar_id()

bool WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 304 of file ratngs.cpp.

304  {
305  for (int i = 0; i < length_; ++i) {
306  if (unichar_ids_[i] == unichar_id) {
307  return true;
308  }
309  }
310  return false;
311 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313

◆ dangerous_ambig_found()

bool WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 361 of file ratngs.h.

361  {
362  return dangerous_ambig_found_;
363  }

◆ debug_string()

const STRING WERD_CHOICE::debug_string ( ) const
inline

Definition at line 503 of file ratngs.h.

503  {
504  STRING word_str;
505  for (int i = 0; i < length_; ++i) {
506  word_str += unicharset_->debug_str(unichar_ids_[i]);
507  word_str += " ";
508  }
509  return word_str;
510  }
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
Definition: strngs.h:44

◆ DisplaySegmentation()

void WERD_CHOICE::DisplaySegmentation ( TWERD *  word)

Definition at line 747 of file ratngs.cpp.

747  {
748 #ifndef GRAPHICS_DISABLED
749  // Number of different colors to draw with.
750  const int kNumColors = 6;
751  static ScrollView *segm_window = NULL;
752  // Check the state against the static prev_drawn_state.
753  static GenericVector<int> prev_drawn_state;
754  bool already_done = prev_drawn_state.size() == length_;
755  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
756  for (int i = 0; i < length_; ++i) {
757  if (prev_drawn_state[i] != state_[i]) {
758  already_done = false;
759  }
760  prev_drawn_state[i] = state_[i];
761  }
762  if (already_done || word->blobs.empty()) return;
763 
764  // Create the window if needed.
765  if (segm_window == NULL) {
766  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
767  2000.0, 256.0, true);
768  } else {
769  segm_window->Clear();
770  }
771 
772  TBOX bbox;
773  int blob_index = 0;
774  for (int c = 0; c < length_; ++c) {
775  ScrollView::Color color =
776  static_cast<ScrollView::Color>(c % kNumColors + 3);
777  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
778  TBLOB* blob = word->blobs[blob_index];
779  bbox += blob->bounding_box();
780  blob->plot(segm_window, color, color);
781  }
782  }
783  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
784  bbox.right(), bbox.bottom());
785  segm_window->Update();
786  window_wait(segm_window);
787 #endif
788 }
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
static void Update()
Definition: scrollview.cpp:715
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 bottom() const
Definition: rect.h:61
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:524
inT16 left() const
Definition: rect.h:68
void Clear()
Definition: scrollview.cpp:595
int size() const
Definition: genericvector.h:72
Definition: blobs.h:261
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
bool empty() const
Definition: genericvector.h:84
void init_to_size(int size, T t)
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765

◆ double_the_size()

void WERD_CHOICE::double_the_size ( )
inline

Make more space in unichar_id_ and fragment_lengths_ arrays.

Definition at line 385 of file ratngs.h.

385  {
386  if (reserved_ > 0) {
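387  unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
388  reserved_, unichar_ids_);
389  script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
390  reserved_, script_pos_);
391  state_ = GenericVector<int>::double_the_size_memcpy(
392  reserved_, state_);
393  certainties_ = GenericVector<float>::double_the_size_memcpy(
394  reserved_, certainties_);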
395  reserved_ *= 2;
396  } else {
397  unichar_ids_ = new UNICHAR_ID[1];
398  script_pos_ = new tesseract::ScriptPos[1];
399  state_ = new int[1];
400  certainties_ = new float[1];
401  reserved_ = 1;
402  }
403  }
static T * double_the_size_memcpy(int current_size, T *data)
int UNICHAR_ID
Definition: unichar.h:33

◆ GetNonSuperscriptSpan()

void WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 375 of file ratngs.cpp.

375  {
376  int end = length();
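377  while (end > 0 &&
378  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
379  BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
380  end--;
381  }
382  int start = 0;
383  while (start < end &&
384  unicharset_->get_isdigit(unichar_ids_[start]) &&
385  BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
386  start++;
387  }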
388  *pstart = start;
389  *pend = end;
390 }
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:301
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

◆ GetTopScriptID()

int WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 653 of file ratngs.cpp.

653  {
654  int max_script = unicharset_->get_script_table_size();
655  int *sid = new int[max_script];
656  int x;
657  for (x = 0; x < max_script; x++) sid[x] = 0;
658  for (x = 0; x < length_; ++x) {
659  int script_id = unicharset_->get_script(unichar_id(x));
660  sid[script_id]++;
661  }
662  if (unicharset_->han_sid() != unicharset_->null_sid()) {
663  // Add the Hiragana & Katakana counts to Han and zero them out.
664  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
665  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
666  sid[unicharset_->hiragana_sid()] = 0;
667  }
668  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
669  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
670  sid[unicharset_->katakana_sid()] = 0;
671  }
672  }
673  // Note that high script ID overrides lower one on a tie, thus biasing
674  // towards non-Common script (if sorted that way in unicharset file).
675  int max_sid = 0;
676  for (x = 1; x < max_script; x++)
677  if (sid[x] >= sid[max_sid]) max_sid = x;
678  if (sid[max_sid] < length_ / 2)
679  max_sid = unicharset_->null_sid();
680  delete[] sid;
681  return max_sid;
682 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int get_script_table_size() const
Definition: unicharset.h:797
int han_sid() const
Definition: unicharset.h:836
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
int hiragana_sid() const
Definition: unicharset.h:837
int null_sid() const
Definition: unicharset.h:831
int katakana_sid() const
Definition: unicharset.h:838

◆ has_rtl_unichar_id()

bool WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 409 of file ratngs.cpp.

409  {
410  int i;
411  for (i = 0; i < length_; ++i) {
412  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
413  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
414  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
415  return true;
416  }
417  }
418  return false;
419 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:638

◆ init() [1/2]

void WERD_CHOICE::init ( int  reserved)
inline

Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and fragment_length_ arrays. Sets other values to default (blank) values.

Definition at line 407 of file ratngs.h.

407  {
408  reserved_ = reserved;
409  if (reserved > 0) {
410  unichar_ids_ = new UNICHAR_ID[reserved];
411  script_pos_ = new tesseract::ScriptPos[reserved];
412  state_ = new int[reserved];
413  certainties_ = new float[reserved];
414  } else {
415  unichar_ids_ = NULL;
416  script_pos_ = NULL;
417  state_ = NULL;
418  certainties_ = NULL;
419  }
420  length_ = 0;
421  adjust_factor_ = 1.0f;
422  rating_ = 0.0;
423  certainty_ = MAX_FLOAT32;
424  min_x_height_ = 0.0f;
425  max_x_height_ = MAX_FLOAT32;
426  permuter_ = NO_PERM;
427  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
428  dangerous_ambig_found_ = false;
429  }
#define MAX_FLOAT32
Definition: host.h:57
int UNICHAR_ID
Definition: unichar.h:33

◆ init() [2/2]

void WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uinT8  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not NULL. src_lengths argument could be NULL, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not NULL. src_lengths argument could be NULL, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 223 of file ratngs.cpp.

227  {
228  int src_string_len = strlen(src_string);
229  if (src_string_len == 0) {
230  this->init(8);
231  } else {
232  this->init(src_lengths ? strlen(src_lengths): src_string_len);
233  length_ = reserved_;
234  int offset = 0;
235  for (int i = 0; i < length_; ++i) {
236  int unichar_length = src_lengths ? src_lengths[i] : 1;
237  unichar_ids_[i] =
238  unicharset_->unichar_to_id(src_string+offset, unichar_length);
239  state_[i] = 1;
240  certainties_[i] = src_certainty;
241  offset += unichar_length;
242  }
243  }
244  adjust_factor_ = 1.0f;
245  rating_ = src_rating;
246  certainty_ = src_certainty;
247  permuter_ = src_permuter;
248  dangerous_ambig_found_ = false;
249 }
void init(int reserved)
Definition: ratngs.h:407
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ length()

int WERD_CHOICE::length ( ) const
inline

Definition at line 301 of file ratngs.h.

301  {
302  return length_;
303  }

◆ make_bad()

void WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 441 of file ratngs.h.

441  {
442  length_ = 0;
443  rating_ = kBadRating;
444  certainty_ = -MAX_FLOAT32;
445  }
#define MAX_FLOAT32
Definition: host.h:57
static const float kBadRating
Definition: ratngs.h:273

◆ MatrixCoord()

MATRIX_COORD WERD_CHOICE::MatrixCoord ( int  index) const

Definition at line 280 of file ratngs.cpp.

280  {
281  int col = 0;
282  for (int i = 0; i < index; ++i)
283  col += state_[i];
284  int row = col + state_[index] - 1;
285  return MATRIX_COORD(col, row);
286 }

◆ max_x_height()

float WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 337 of file ratngs.h.

337  {
338  return max_x_height_;
339  }

◆ min_x_height()

float WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 334 of file ratngs.h.

334  {
335  return min_x_height_;
336  }

◆ operator+=()

WERD_CHOICE & WERD_CHOICE::operator+= ( const WERD_CHOICE &  second)

WERD_CHOICE::operator+=

Cat a second word rating on the end of this current one. The ratings are added and the confidence is the min. If the permuters are NOT the same, the permuter is set to COMPOUND_PERM.

Definition at line 463 of file ratngs.cpp.

463  {
464  ASSERT_HOST(unicharset_ == second.unicharset_);
465  while (reserved_ < length_ + second.length()) {
466  this->double_the_size();
467  }
468  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
469  for (int i = 0; i < second.length(); ++i) {
470  unichar_ids_[length_ + i] = other_unichar_ids[i];
471  state_[length_ + i] = second.state_[i];
472  certainties_[length_ + i] = second.certainties_[i];
473  script_pos_[length_ + i] = second.BlobPosition(i);
474  }
475  length_ += second.length();
476  if (second.adjust_factor_ > adjust_factor_)
477  adjust_factor_ = second.adjust_factor_;
478  rating_ += second.rating(); // add ratings
479  if (second.certainty() < certainty_) // take min
480  certainty_ = second.certainty();
481  if (second.dangerous_ambig_found_)
482  dangerous_ambig_found_ = true;
483  if (permuter_ == NO_PERM) {
484  permuter_ = second.permuter();
485  } else if (second.permuter() != NO_PERM &&
486  second.permuter() != permuter_) {
487  permuter_ = COMPOUND_PERM;
488  }
489  return *this;
490 }
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:310
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:320
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:385
float certainty() const
Definition: ratngs.h:328
uinT8 permuter() const
Definition: ratngs.h:344
int length() const
Definition: ratngs.h:301
float rating() const
Definition: ratngs.h:325
#define ASSERT_HOST(x)
Definition: errcode.h:84
int UNICHAR_ID
Definition: unichar.h:33

◆ operator=()

WERD_CHOICE & WERD_CHOICE::operator= ( const WERD_CHOICE &  source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 499 of file ratngs.cpp.

499  {
500  while (reserved_ < source.length()) {
501  this->double_the_size();
502  }
503 
504  unicharset_ = source.unicharset_;
505  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
506  for (int i = 0; i < source.length(); ++i) {
507  unichar_ids_[i] = other_unichar_ids[i];
508  state_[i] = source.state_[i];
509  certainties_[i] = source.certainties_[i];
510  script_pos_[i] = source.BlobPosition(i);
511  }
512  length_ = source.length();
513  adjust_factor_ = source.adjust_factor_;
514  rating_ = source.rating();
515  certainty_ = source.certainty();
516  min_x_height_ = source.min_x_height();
517  max_x_height_ = source.max_x_height();
518  permuter_ = source.permuter();
519  dangerous_ambig_found_ = source.dangerous_ambig_found_;
520  return *this;
521 }
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:310
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:320
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:385
float certainty() const
Definition: ratngs.h:328
uinT8 permuter() const
Definition: ratngs.h:344
float min_x_height() const
Definition: ratngs.h:334
int length() const
Definition: ratngs.h:301
float rating() const
Definition: ratngs.h:325
float max_x_height() const
Definition: ratngs.h:337
int UNICHAR_ID
Definition: unichar.h:33

◆ permuter()

uinT8 WERD_CHOICE::permuter ( ) const
inline

Definition at line 344 of file ratngs.h.

344  {
345  return permuter_;
346  }

◆ permuter_name() [1/2]

const char * WERD_CHOICE::permuter_name ( uinT8  permuter)
static

Definition at line 174 of file ratngs.cpp.

174  {
175  return kPermuterTypeNames[permuter];
176 }
uinT8 permuter() const
Definition: ratngs.h:344

◆ permuter_name() [2/2]

const char * WERD_CHOICE::permuter_name ( ) const

Definition at line 261 of file ratngs.cpp.

261  {
262  return kPermuterTypeNames[permuter_];
263 }

◆ print() [1/2]

void WERD_CHOICE::print ( ) const
inline

Definition at line 564 of file ratngs.h.

564 { this->print(""); }
void print() const
Definition: ratngs.h:564

◆ print() [2/2]

void WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 710 of file ratngs.cpp.

710  {
711  tprintf("%s : ", msg);
712  for (int i = 0; i < length_; ++i) {
713  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
714  }
715  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
716  rating_, certainty_, adjust_factor_, permuter_,
717  min_x_height_, max_x_height_, dangerous_ambig_found_);
718  tprintf("pos");
719  for (int i = 0; i < length_; ++i) {
720  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
721  }
722  tprintf("\nstr");
723  for (int i = 0; i < length_; ++i) {
724  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
725  }
726  tprintf("\nstate:");
727  for (int i = 0; i < length_; ++i) {
728  tprintf("\t%d ", state_[i]);
729  }
730  tprintf("\nC");
731  for (int i = 0; i < length_; ++i) {
732  tprintf("\t%.3f", certainties_[i]);
733  }
734  tprintf("\n");
735 }
#define tprintf(...)
Definition: tprintf.h:31
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:180

◆ print_state()

void WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 738 of file ratngs.cpp.

738  {
739  tprintf("%s", msg);
740  for (int i = 0; i < length_; ++i)
741  tprintf(" %d", state_[i]);
742  tprintf("\n");
743 }
#define tprintf(...)
Definition: tprintf.h:31

◆ punct_stripped()

void WERD_CHOICE::punct_stripped ( int *  start,
int *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 361 of file ratngs.cpp.

361  {
362  *start = 0;
363  *end = length() - 1;
364  while (*start < length() &&
365  unicharset()->get_ispunctuation(unichar_id(*start))) {
366  (*start)++;
367  }
368  while (*end > -1 &&
369  unicharset()->get_ispunctuation(unichar_id(*end))) {
370  (*end)--;
371  }
372  (*end)++;
373 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
int length() const
Definition: ratngs.h:301

◆ rating()

float WERD_CHOICE::rating ( ) const
inline

Definition at line 325 of file ratngs.h.

325  {
326  return rating_;
327  }

◆ remove_last_unichar_id()

void WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 481 of file ratngs.h.

481 { --length_; }

◆ remove_unichar_id()

void WERD_CHOICE::remove_unichar_id ( int  index)
inline

Definition at line 482 of file ratngs.h.

482  {
483  this->remove_unichar_ids(index, 1);
484  }
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:320

◆ remove_unichar_ids()

void WERD_CHOICE::remove_unichar_ids ( int  start,
int  num 
)

remove_unichar_ids

Removes num unichar ids starting from index start from unichar_ids_ and updates length_ and fragment_lengths_ to reflect this change. Note: this function does not modify rating_ and certainty_.

Definition at line 320 of file ratngs.cpp.

320  {
321  ASSERT_HOST(start >= 0 && start + num <= length_);
322  // Accumulate the states to account for the merged blobs.
323  for (int i = 0; i < num; ++i) {
324  if (start > 0)
325  state_[start - 1] += state_[start + i];
326  else if (start + num < length_)
327  state_[start + num] += state_[start + i];
328  }
329  for (int i = start; i + num < length_; ++i) {
330  unichar_ids_[i] = unichar_ids_[i + num];
331  script_pos_[i] = script_pos_[i + num];
332  state_[i] = state_[i + num];
333  certainties_[i] = certainties_[i + num];
334  }
335  length_ -= num;
336 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ reverse_and_mirror_unichar_ids()

void WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 343 of file ratngs.cpp.

343  {
344  for (int i = 0; i < length_ / 2; ++i) {
345  UNICHAR_ID tmp_id = unichar_ids_[i];
346  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
347  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
348  }
349  if (length_ % 2 != 0) {
350  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
351  }
352 }
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:645
int UNICHAR_ID
Definition: unichar.h:33

◆ ScriptPositionOf()

ScriptPos WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET &  unicharset,
const TBOX &  blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 615 of file ratngs.cpp.

618  {
619  ScriptPos retval = tesseract::SP_NORMAL;
620  int top = blob_box.top();
621  int bottom = blob_box.bottom();
622  int min_bottom, max_bottom, min_top, max_top;
623  unicharset.get_top_bottom(unichar_id,
624  &min_bottom, &max_bottom,
625  &min_top, &max_top);
626 
627  int sub_thresh_top = min_top - kMinSubscriptOffset;
628  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
629  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
630  if (bottom <= kMaxDropCapBottom) {
631  retval = tesseract::SP_DROPCAP;
632  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
633  retval = tesseract::SP_SUBSCRIPT;
634  } else if (bottom > sup_thresh_bot) {
635  retval = tesseract::SP_SUPERSCRIPT;
636  }
637 
638  if (print_debug) {
639  const char *pos = ScriptPosToString(retval);
640  tprintf("%s Character %s[bot:%d top: %d] "
641  "bot_range[%d,%d] top_range[%d, %d] "
642  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
643  pos, unicharset.id_to_unichar(unichar_id),
644  bottom, top,
645  min_bottom, max_bottom, min_top, max_top,
646  sub_thresh_bot, sub_thresh_top,
647  sup_thresh_bot);
648  }
649  return retval;
650 }
const int kBlnBaselineOffset
Definition: normalis.h:29
const int kMinSubscriptOffset
Definition: ratngs.cpp:41
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
const int kMaxDropCapBottom
Definition: ratngs.cpp:45
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
inT16 bottom() const
Definition: rect.h:61
#define tprintf(...)
Definition: tprintf.h:31
inT16 top() const
Definition: rect.h:54
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const int kMinSuperscriptOffset
Definition: ratngs.cpp:43
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:180

◆ set_adjust_factor()

void WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 307 of file ratngs.h.

307  {
308  adjust_factor_ = factor;
309  }

◆ set_blob_choice()

void WERD_CHOICE::set_blob_choice ( int  index,
int  blob_count,
const BLOB_CHOICE *  blob_choice 
)

Definition at line 290 of file ratngs.cpp.

291  {
292  unichar_ids_[index] = blob_choice->unichar_id();
293  script_pos_[index] = tesseract::SP_NORMAL;
294  state_[index] = blob_count;
295  certainties_[index] = blob_choice->certainty();
296 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
float certainty() const
Definition: ratngs.h:82

◆ set_certainty()

void WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 370 of file ratngs.h.

370  {
371  certainty_ = new_val;
372  }

◆ set_dangerous_ambig_found_()

void WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 364 of file ratngs.h.

364  {
365  dangerous_ambig_found_ = value;
366  }

◆ set_length()

void WERD_CHOICE::set_length ( int  len)
inline

Definition at line 379 of file ratngs.h.

379  {
380  ASSERT_HOST(reserved_ >= len);
381  length_ = len;
382  }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ set_permuter()

void WERD_CHOICE::set_permuter ( uinT8  perm)
inline

Definition at line 373 of file ratngs.h.

373  {
374  permuter_ = perm;
375  }

◆ set_rating()

void WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 367 of file ratngs.h.

367  {
368  rating_ = new_val;
369  }

◆ set_unichar_id() [1/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  index 
)
inline

Definition at line 357 of file ratngs.h.

357  {
358  assert(index < length_);
359  unichar_ids_[index] = unichar_id;
360  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313

◆ set_unichar_id() [2/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
int  index 
)
inline

Definition at line 462 of file ratngs.h.

463  {
464  assert(index < length_);
465  unichar_ids_[index] = unichar_id;
466  state_[index] = blob_count;
467  certainties_[index] = certainty;
468  script_pos_[index] = tesseract::SP_NORMAL;
469  rating_ += rating;
470  if (certainty < certainty_) {
471  certainty_ = certainty;
472  }
473  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
float certainty() const
Definition: ratngs.h:328
float rating() const
Definition: ratngs.h:325

◆ set_unichars_in_script_order()

bool WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 515 of file ratngs.h.

515  {
516  return unichars_in_script_order_ = in_script_order;
517  }

◆ set_x_heights()

void WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 340 of file ratngs.h.

340  {
341  min_x_height_ = min_height;
342  max_x_height_ = max_height;
343  }

◆ SetAllScriptPositions()

void WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 609 of file ratngs.cpp.

609  {
610  for (int i = 0; i < length_; ++i)
611  script_pos_[i] = position;
612 }

◆ SetScriptPositions() [1/2]

void WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD * word 
)

Definition at line 528 of file ratngs.cpp.

528  {
529  // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
530  // we don't have easy access to the flags Tesseract stores. Therefore, debug
531  // for this module is hard compiled in.
532  int debug = 0;
533 
534  // Initialize to normal.
535  for (int i = 0; i < length_; ++i)
536  script_pos_[i] = tesseract::SP_NORMAL;
537  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
538  return;
539  }
540 
541  int position_counts[4];
542  for (int i = 0; i < 4; i++) {
543  position_counts[i] = 0;
544  }
545 
546  int chunk_index = 0;
547  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
548  TBLOB* tblob = word->blobs[chunk_index];
549  int uni_id = unichar_id(blob_index);
550  TBOX blob_box = tblob->bounding_box();
551  if (state_ != NULL) {
552  for (int i = 1; i < state_[blob_index]; ++i) {
553  ++chunk_index;
554  tblob = word->blobs[chunk_index];
555  blob_box += tblob->bounding_box();
556  }
557  }
558  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
559  uni_id);
560  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
561  script_pos_[blob_index] = tesseract::SP_NORMAL;
562  }
563  position_counts[script_pos_[blob_index]]++;
564  }
565  // If almost everything looks like a superscript or subscript,
566  // we most likely just got the baseline wrong.
567  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
568  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
569  if (debug >= 2) {
570  tprintf("Most characters of %s are subscript or superscript.\n"
571  "That seems wrong, so I'll assume we got the baseline wrong\n",
572  unichar_string().string());
573  }
574  for (int i = 0; i < length_; i++) {
575  ScriptPos sp = script_pos_[i];
576  if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
577  position_counts[sp]--;
578  position_counts[tesseract::SP_NORMAL]++;
579  script_pos_[i] = tesseract::SP_NORMAL;
580  }
581  }
582  }
583 
584  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
585  debug >= 2) {
586  tprintf("SetScriptPosition on %s\n", unichar_string().string());
587  int chunk_index = 0;
588  for (int blob_index = 0; blob_index < length_; ++blob_index) {
589  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
590  TBLOB* tblob = word->blobs[chunk_index];
591  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
592  unichar_id(blob_index));
593  }
594  chunk_index += state_ != NULL ? state_[blob_index] : 1;
595  }
596  }
597 }
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const STRING & unichar_string() const
Definition: ratngs.h:525
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:615
TBOX bounding_box() const
Definition: blobs.cpp:482
int TotalOfStates() const
Definition: ratngs.cpp:697
int NumBlobs() const
Definition: blobs.h:425
#define tprintf(...)
Definition: tprintf.h:31
Definition: blobs.h:261
Definition: rect.h:30
bool empty() const
Definition: genericvector.h:84

◆ SetScriptPositions() [2/2]

void WERD_CHOICE::SetScriptPositions ( const tesseract::ScriptPos * positions,
int  length 
)

Definition at line 599 of file ratngs.cpp.

600  {
601  ASSERT_HOST(length == length_);
602  if (positions != script_pos_) {
603  delete [] script_pos_;
604  script_pos_ = new ScriptPos[length];
605  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
606  }
607 }
int length() const
Definition: ratngs.h:301
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ shallow_copy()

WERD_CHOICE WERD_CHOICE::shallow_copy ( int  start,
int  end 
) const

Definition at line 392 of file ratngs.cpp.

392  {
393  ASSERT_HOST(start >= 0 && start <= length_);
394  ASSERT_HOST(end >= 0 && end <= length_);
395  if (end < start) { end = start; }
396  WERD_CHOICE retval(unicharset_, end - start);
397  for (int i = start; i < end; i++) {
398  retval.append_unichar_id_space_allocated(
399  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
400  }
401  return retval;
402 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ state()

int WERD_CHOICE::state ( int  index) const
inline

Definition at line 317 of file ratngs.h.

317  {
318  return state_[index];
319  }

◆ string_and_lengths()

void WERD_CHOICE::string_and_lengths ( STRING * word_str,
STRING * word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids and, if word_lengths_str is not NULL, populates word_lengths_str with the corresponding unichar lengths.

Definition at line 427 of file ratngs.cpp.

428  {
429  *word_str = "";
430  if (word_lengths_str != NULL) *word_lengths_str = "";
431  for (int i = 0; i < length_; ++i) {
432  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
433  *word_str += ch;
434  if (word_lengths_str != NULL) {
435  *word_lengths_str += strlen(ch);
436  }
437  }
438 }
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:274

◆ TotalOfStates()

int WERD_CHOICE::TotalOfStates ( ) const

Definition at line 697 of file ratngs.cpp.

697  {
698  int total_chunks = 0;
699  for (int i = 0; i < length_; ++i) {
700  total_chunks += state_[i];
701  }
702  return total_chunks;
703 }

◆ unichar_id()

UNICHAR_ID WERD_CHOICE::unichar_id ( int  index) const
inline

Definition at line 313 of file ratngs.h.

313  {
314  assert(index < length_);
315  return unichar_ids_[index];
316  }

◆ unichar_ids()

const UNICHAR_ID* WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 310 of file ratngs.h.

310  {
311  return unichar_ids_;
312  }

◆ unichar_lengths()

const STRING& WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 532 of file ratngs.h.

532  {
533  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
534  return unichar_lengths_;
535  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427

◆ unichar_string()

const STRING& WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 525 of file ratngs.h.

525  {
526  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
527  return unichar_string_;
528  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427

◆ unichars_in_script_order()

bool WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 519 of file ratngs.h.

519  {
520  return unichars_in_script_order_;
521  }

◆ unicharset()

const UNICHARSET* WERD_CHOICE::unicharset ( ) const
inline

Definition at line 298 of file ratngs.h.

298  {
299  return unicharset_;
300  }

◆ UpdateStateForSplit()

void WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 685 of file ratngs.cpp.

685  {
686  int total_chunks = 0;
687  for (int i = 0; i < length_; ++i) {
688  total_chunks += state_[i];
689  if (total_chunks > blob_position) {
690  ++state_[i];
691  return;
692  }
693  }
694 }

Member Data Documentation

◆ kBadRating

const float WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 273 of file ratngs.h.


The documentation for this class was generated from the following files: