tesseract  3.05.02
werd.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: werd.h
3  * Description: Declaration of the WERD class.
4  * Author: Ray Smith
5  * Created: Tue Oct 08 14:32:12 BST 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef WERD_H
21 #define WERD_H
22 
23 #include "params.h"
24 #include "bits16.h"
25 #include "elst2.h"
26 #include "strngs.h"
27 #include "blckerr.h"
28 #include "stepblob.h"
29 
// Bit numbers for the per-word flags. WERD::flag() and WERD::set_flag() pass
// these values straight to BITS16::bit() / BITS16::set_bit(), so each
// enumerator is a bit index, not a mask. Order matters: the implicit values
// (0, 1, 2, ...) are the bit positions.
enum WERD_FLAGS
{
  W_SEGMENTED,           ///< correctly segmented
  W_ITALIC,              ///< italic text
  W_BOLD,                ///< bold text
  W_BOL,                 ///< start of line
  W_EOL,                 ///< end of line
  W_NORMALIZED,          ///< flags
  W_SCRIPT_HAS_XHEIGHT,  ///< x-height concept makes sense.
  W_SCRIPT_IS_LATIN,     ///< Special case latin for y. splitting.
  W_DONT_CHOP,           ///< fixed pitch chopped
  W_REP_CHAR,            ///< repeated character
  W_FUZZY_SP,            ///< fuzzy space
  W_FUZZY_NON,           ///< fuzzy nonspace
  W_INVERSE              ///< white on black
};
46 
// Display flags bit number allocations. The implicit enumerator values
// (0, 1, 2, ...) are bit indices, so the order must not change.
enum DISPLAY_FLAGS
{
  DF_BOX,           ///< Bounding box
  DF_TEXT,          ///< Correct ascii
  DF_POLYGONAL,     ///< Polyg approx
  DF_EDGE_STEP,     ///< Edge steps
  DF_BN_POLYGONAL,  ///< BL normalised polygonal approx
  DF_BLAMER         ///< Blamer information
};
57 
58 class ROW; //forward decl
59 
60 class WERD : public ELIST2_LINK {
61  public:
62  WERD() {}
63  // WERD constructed with:
64  // blob_list - blobs of the word (we take this list's contents)
65  // blanks - number of blanks before the word
66  // text - correct text (outlives WERD)
67  WERD(C_BLOB_LIST *blob_list, uinT8 blanks, const char *text);
68 
69  // WERD constructed from:
70  // blob_list - blobs in the word
71  // clone - werd to clone flags, etc from.
72  WERD(C_BLOB_LIST *blob_list, WERD *clone);
73 
74  // Construct a WERD from a single_blob and clone the flags from this.
75  // W_BOL and W_EOL flags are set according to the given values.
76  WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob);
77 
78  ~WERD() {
79  }
80 
81  // assignment
82  WERD & operator= (const WERD &source);
83 
84  // This method returns a new werd constructed using the blobs in the input
85  // all_blobs list, which correspond to the blobs in this werd object. The
86  // blobs used to construct the new word are consumed and removed from the
87  // input all_blobs list.
88  // Returns NULL if the word couldn't be constructed.
89  // Returns original blobs for which no matches were found in the output list
90  // orphan_blobs (appends).
91  WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs,
92  C_BLOB_LIST *orphan_blobs);
93 
94  // Accessors for reject / DUFF blobs in various formats
95  C_BLOB_LIST *rej_cblob_list() { // compact format
96  return &rej_cblobs;
97  }
98 
99  // Accessors for good blobs in various formats.
100  C_BLOB_LIST *cblob_list() { // get compact blobs
101  return &cblobs;
102  }
103 
104  uinT8 space() { // access function
105  return blanks;
106  }
107  void set_blanks(uinT8 new_blanks) {
108  blanks = new_blanks;
109  }
110  int script_id() const {
111  return script_id_;
112  }
113  void set_script_id(int id) {
114  script_id_ = id;
115  }
116 
117  // Returns the (default) bounding box including all the dots.
118  TBOX bounding_box() const; // compute bounding box
119  // Returns the bounding box including the desired combination of upper and
120  // lower noise/diacritic elements.
121  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
122  // Returns the bounding box of only the good blobs.
123  TBOX true_bounding_box() const;
124 
125  const char *text() const { return correct.string(); }
126  void set_text(const char *new_text) { correct = new_text; }
127 
128  BOOL8 flag(WERD_FLAGS mask) const { return flags.bit(mask); }
129  void set_flag(WERD_FLAGS mask, BOOL8 value) { flags.set_bit(mask, value); }
130 
131  BOOL8 display_flag(uinT8 flag) const { return disp_flags.bit(flag); }
133  disp_flags.set_bit(flag, value);
134  }
135 
136  WERD *shallow_copy(); // shallow copy word
137 
138  // reposition word by vector
139  void move(const ICOORD vec);
140 
141  // join other's blobs onto this werd, emptying out other.
142  void join_on(WERD* other);
143 
144  // copy other's blobs onto this word, leaving other intact.
145  void copy_on(WERD* other);
146 
147  // tprintf word metadata (but not blob innards)
148  void print();
149 
150  #ifndef GRAPHICS_DISABLED
151  // plot word on window in a uniform colour
152  void plot(ScrollView *window, ScrollView::Color colour);
153 
154  // Get the next color in the (looping) rainbow.
156 
157  // plot word on window in a rainbow of colours
158  void plot(ScrollView *window);
159 
160  // plot rejected blobs in a rainbow of colours
161  void plot_rej_blobs(ScrollView *window);
162  #endif // GRAPHICS_DISABLED
163 
164  // Removes noise from the word by moving small outlines to the rej_cblobs
165  // list, based on the size_threshold.
166  void CleanNoise(float size_threshold);
167 
168  // Extracts all the noise outlines and stuffs the pointers into the given
169  // vector of outlines. Afterwards, the outlines vector owns the pointers.
171  // Adds the selected outlines to the indcated real blobs, and puts the rest
172  // back in rej_cblobs where they came from. Where the target_blobs entry is
173  // NULL, a run of wanted outlines is put into a single new blob.
174  // Ownership of the outlines is transferred back to the word. (Hence
175  // GenericVector and not PointerVector.)
176  // Returns true if any new blob was added to the start of the word, which
177  // suggests that it might need joining to the word before it, and likewise
178  // sets make_next_word_fuzzy true if any new blob was added to the end.
179  bool AddSelectedOutlines(const GenericVector<bool> &wanted,
180  const GenericVector<C_BLOB *> &target_blobs,
181  const GenericVector<C_OUTLINE *> &outlines,
182  bool *make_next_word_fuzzy);
183 
184  private:
185  uinT8 blanks; // no of blanks
186  uinT8 dummy; // padding
187  BITS16 flags; // flags about word
188  BITS16 disp_flags; // display flags
189  inT16 script_id_; // From unicharset.
190  STRING correct; // correct text
191  C_BLOB_LIST cblobs; // compacted blobs
192  C_BLOB_LIST rej_cblobs; // DUFF blobs
193 };
194 
196 #include "ocrrow.h" // placed here due to
197 // compare words by increasing order of left edge, suitable for qsort(3)
198 int word_comparator(const void *word1p, const void *word2p);
199 #endif
void copy_on(WERD *other)
Definition: werd.cpp:234
TBOX true_bounding_box() const
Definition: werd.cpp:181
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const
Definition: werd.cpp:164
short inT16
Definition: host.h:33
BOOL8 bit(uinT8 bit_num) const
Definition: bits16.h:56
int script_id() const
Definition: werd.h:110
Definition: werd.h:36
integer coordinate
Definition: points.h:30
void CleanNoise(float size_threshold)
Definition: werd.cpp:506
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
Definition: werd.h:55
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:297
void join_on(WERD *other)
Definition: werd.cpp:211
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:306
unsigned char uinT8
Definition: host.h:32
Definition: werd.h:44
void set_display_flag(uinT8 flag, BOOL8 value)
Definition: werd.h:132
unsigned char BOOL8
Definition: host.h:46
BOOL8 display_flag(uinT8 flag) const
Definition: werd.h:131
void set_bit(uinT8 bit_num, BOOL8 value)
Definition: bits16.h:47
Definition: werd.h:34
WERD()
Definition: werd.h:62
void print()
Definition: werd.cpp:266
const char * string() const
Definition: strngs.cpp:201
uinT8 space()
Definition: werd.h:104
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: werd.h:60
Definition: werd.h:35
void plot_rej_blobs(ScrollView *window)
Definition: werd.cpp:337
WERD * shallow_copy()
Definition: werd.cpp:352
int word_comparator(const void *word1p, const void *word2p)
Definition: werd.cpp:394
Definition: werd.h:33
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:950
Definition: werd.h:51
WERD_FLAGS
Definition: werd.h:30
Definition: ocrrow.h:32
Definition: strngs.h:44
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:530
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548
const char * text() const
Definition: werd.h:125
~WERD()
Definition: werd.h:78
WERD & operator=(const WERD &source)
Definition: werd.cpp:369
WERD * ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs)
Definition: werd.cpp:412
Definition: rect.h:30
void move(const ICOORD vec)
Definition: werd.cpp:198
Definition: bits16.h:25
void set_script_id(int id)
Definition: werd.h:113
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
DISPLAY_FLAGS
Definition: werd.h:47
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
TBOX bounding_box() const
Definition: werd.cpp:160
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
void set_text(const char *new_text)
Definition: werd.h:126
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
Definition: werd.h:50