tesseract  3.05.02
lm_pain_points.h
Go to the documentation of this file.
1 // File: lm_pain_points.h
3 // Description: Functions that utilize the knowledge about the properties
4 // of the paths explored by the segmentation search in order
5 // to generate "pain points" - the locations in the ratings
6 // matrix which should be classified next.
7 // Author: Rika Antonova
8 // Created: Mon Jun 20 11:26:43 PST 2012
9 //
10 // (C) Copyright 2012, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
24 #define TESSERACT_WORDREC_PAIN_POINTS_H_
25 
26 #include "associate.h"
27 #include "dict.h"
28 #include "genericheap.h"
29 #include "lm_state.h"
30 
31 namespace tesseract {
32 
33 // Heap of pain points used for determining where to chop/join.
35 
36 // Types of pain points (ordered by decreasing importance).
42 
44 };
45 
// Printable names of the pain point types. PainPointDescription() indexes
// this table directly with an LMPainPointsType value.
// NOTE(review): the entries presumably have to stay in the same order as the
// LMPainPointsType enumerators - confirm against the enum definition.
46 static const char * const LMPainPointsTypeName[] = {
47  "LM_PPTYPE_BLAMER",
48  "LM_PPTYPE_AMBIGS",
49  "LM_PPTYPE_PATH",
50  "LM_PPTYPE_SHAPE",
51 };
52 
// Keeps one heap of pain points per LMPainPointsType and declares the
// operations that generate, dequeue, clear and remap those pain points.
53 class LMPainPoints {
54  public:
55 
57  // If there is a significant drop in character ngram probability or a
58  // dangerous ambiguity, make the thresholds on what blob combinations
59  // can be classified looser.
60  static const float kLooseMaxCharWhRatio;
61  // Returns a description (the printable name) of the type of a pain point.
    // NOTE(review): type is used as an unchecked index into
    // LMPainPointsTypeName, which has four entries. Deque() is documented to
    // return LM_PPTYPE_NUM for an empty queue, so confirm that callers never
    // pass that value here.
62  static const char* PainPointDescription(LMPainPointsType type) {
63  return LMPainPointsTypeName[type];
64  }
65 
    // Stores the given settings in the corresponding members:
    //   max -> max_heap_size_, rat -> max_char_wh_ratio_,
    //   fp -> fixed_pitch_, d -> dict_, deb -> debug_level_.
    // Only the Dict pointer is stored; the Dict itself is not copied.
66  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) :
67  max_heap_size_(max), max_char_wh_ratio_(rat), fixed_pitch_(fp),
68  dict_(d), debug_level_(deb) {}
70 
71  // Returns true if the heap of pain points of pp_type is not empty.
72  inline bool HasPainPoints(LMPainPointsType pp_type) const {
73  return !pain_points_heaps_[pp_type].empty();
74  }
75 
76  // Dequeues the next pain point from the pain points queue and copies
77  // its contents and priority to *pp and *priority.
78  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
79  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
80 
81  // Clears the pain points heap of every type.
82  void Clear() {
83  for (int h = 0; h < LM_PPTYPE_NUM; ++h) pain_points_heaps_[h].clear();
84  }
85 
86  // For each cell, generate a "pain point" if the cell is not classified
87  // and has a left or right neighbor that was classified.
88  void GenerateInitial(WERD_RES *word_res);
89 
90  // Generate pain points from the given path.
91  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,
92  WERD_RES *word_res);
93 
94  // Generate pain points from dangerous ambiguities in best choice.
95  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,
96  WERD_RES *word_res);
97 
98  // Generate a pain point for the blamer.
    // Forwards to GeneratePainPoint() with type LM_PPTYPE_BLAMER, a
    // special_priority of 0.0 and ok_to_extend set to false, and returns its
    // result. max_char_wh_ratio is a double here but GeneratePainPoint() takes
    // a float, so the value is narrowed on the call.
99  bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res,
100  int col, int row) {
101  return GeneratePainPoint(col, row, LM_PPTYPE_BLAMER, 0.0, false,
102  max_char_wh_ratio, word_res);
103  }
104 
105  // Adds a pain point to classify chunks_record->ratings(col, row).
    // NOTE(review): chunks_record is not a parameter of this function; the
    // ratings matrix presumably comes from word_res - confirm.
106  // Returns true if a new pain point was added to an appropriate heap.
107  // Pain point priority is set to special_priority for pain points of
108  // LM_PPTYPE_AMBIGS or LM_PPTYPE_PATH, for other pain points
109  // AssociateStats::gap_sum is used.
110  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
111  float special_priority, bool ok_to_extend,
112  float max_char_wh_ratio,
113  WERD_RES *word_res);
114 
115  // Adjusts the pain point coordinates to cope with expansion of the ratings
116  // matrix due to a split of the blob with the given index.
117  void RemapForSplit(int index);
118 
119  private:
120  // Priority queues containing pain points generated by the language model,
    // one queue per pain point type (indexed by LMPainPointsType).
121  // The priority is set by the language model components, adjustments like
122  // seam cost and width priority are factored into the priority.
123  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
124  // Maximum number of points to keep in the heap.
125  int max_heap_size_;
126  // Maximum character width/height ratio.
127  float max_char_wh_ratio_;
128  // Set to true if fixed pitch should be assumed.
129  bool fixed_pitch_;
130  // Cached pointer to dictionary.
131  const Dict *dict_;
132  // Debug level for print statements.
133  int debug_level_;
134 };
135 
136 } // namespace tesseract
137 
138 #endif // TESSERACT_WORDREC_PAIN_POINTS_H_
void RemapForSplit(int index)
bool empty() const
Definition: genericheap.h:68
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
GenericHeap< MatrixCoordPair > PainPointHeap
static const float kDefaultPainPointPriorityAdjustment
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
static const float kLooseMaxCharWhRatio
static const char * PainPointDescription(LMPainPointsType type)
void GenerateInitial(WERD_RES *word_res)
bool HasPainPoints(LMPainPointsType pp_type) const