tesseract  3.05.02
tesseract::BoxChar Class Reference

#include <boxchar.h>

Public Member Functions

 BoxChar (const char *utf8_str, int len)
 
 ~BoxChar ()
 
const string & ch () const
 
const Box * box () const
 
const int & page () const
 
void AddBox (int x, int y, int width, int height)
 
void set_page (int page)
 
string * mutable_ch ()
 
Box * mutable_box ()
 
bool operator< (const BoxChar &other) const
 

Static Public Member Functions

static void TranslateBoxes (int xshift, int yshift, vector< BoxChar *> *boxes)
 
static void PrepareToWrite (vector< BoxChar *> *boxes)
 
static void InsertNewlines (bool rtl_rules, bool vertical_rules, vector< BoxChar *> *boxes)
 
static void InsertSpaces (bool rtl_rules, bool vertical_rules, vector< BoxChar *> *boxes)
 
static void ReorderRTLText (vector< BoxChar *> *boxes)
 
static bool ContainsMostlyRTL (const vector< BoxChar *> &boxes)
 
static bool MostlyVertical (const vector< BoxChar *> &boxes)
 
static int TotalByteLength (const vector< BoxChar *> &boxes)
 
static void RotateBoxes (float rotation, int xcenter, int ycenter, int start_box, int end_box, vector< BoxChar *> *boxes)
 
static void WriteTesseractBoxFile (const string &name, int height, const vector< BoxChar *> &boxes)
 
static string GetTesseractBoxStr (int height, const vector< BoxChar *> &boxes)
 

Detailed Description

Definition at line 40 of file boxchar.h.

Constructor & Destructor Documentation

◆ BoxChar()

tesseract::BoxChar::BoxChar ( const char *  utf8_str,
int  len 
)

Definition at line 41 of file boxchar.cpp.

41  : ch_(utf8_str, len) {
42  box_ = NULL;
43 }

◆ ~BoxChar()

tesseract::BoxChar::~BoxChar ( )

Definition at line 45 of file boxchar.cpp.

45 { boxDestroy(&box_); }

Member Function Documentation

◆ AddBox()

void tesseract::BoxChar::AddBox ( int  x,
int  y,
int  width,
int  height 
)

Definition at line 47 of file boxchar.cpp.

47  {
48  box_ = boxCreate(x, y, width, height);
49 }

◆ box()

const Box* tesseract::BoxChar::box ( ) const
inline

Definition at line 48 of file boxchar.h.

48 { return box_; }

◆ ch()

const string& tesseract::BoxChar::ch ( ) const
inline

Definition at line 47 of file boxchar.h.

47 { return ch_; }

◆ ContainsMostlyRTL()

bool tesseract::BoxChar::ContainsMostlyRTL ( const vector< BoxChar *> &  boxes)
static

Definition at line 215 of file boxchar.cpp.

215  {
216  int num_rtl = 0, num_ltr = 0;
217  for (int i = 0; i < boxes.size(); ++i) {
218  // Convert the unichar to UTF32 representation
219  GenericVector<char32> uni_vector;
220  if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) {
221  tprintf("Illegal utf8 in boxchar %d string:%s = ", i,
222  boxes[i]->ch_.c_str());
223  for (int c = 0; c < boxes[i]->ch_.size(); ++c) {
224  tprintf(" 0x%x", boxes[i]->ch_[c]);
225  }
226  tprintf("\n");
227  continue;
228  }
229  for (int j = 0; j < uni_vector.size(); ++j) {
230  UCharDirection dir = u_charDirection(uni_vector[j]);
231  if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
232  dir == U_ARABIC_NUMBER) {
233  ++num_rtl;
234  } else {
235  ++num_ltr;
236  }
237  }
238  }
239  return num_rtl > num_ltr;
240 }
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72

◆ GetTesseractBoxStr()

string tesseract::BoxChar::GetTesseractBoxStr ( int  height,
const vector< BoxChar *> &  boxes 
)
static

Definition at line 300 of file boxchar.cpp.

301  {
302  string output;
303  char buffer[kMaxLineLength];
304  for (int i = 0; i < boxes.size(); ++i) {
305  const Box* box = boxes[i]->box_;
306  if (box == NULL) {
307  tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
308  return "";
309  }
310  int nbytes =
311  snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
312  boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
313  box->x + box->w, height - box->y, boxes[i]->page_);
314  output.append(buffer, nbytes);
315  }
316  return output;
317 }
#define tprintf(...)
Definition: tprintf.h:31
const int kMaxLineLength
Definition: boxchar.cpp:291
const Box * box() const
Definition: boxchar.h:48

◆ InsertNewlines()

void tesseract::BoxChar::InsertNewlines ( bool  rtl_rules,
bool  vertical_rules,
vector< BoxChar *> *  boxes 
)
static

Definition at line 81 of file boxchar.cpp.

82  {
83  int prev_i = -1;
84  int max_shift = 0;
85  for (int i = 0; i < boxes->size(); ++i) {
86  Box* box = (*boxes)[i]->box_;
87  if (box == NULL) {
88  if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) {
89  // Erase null boxes at the start of a line and after another null box.
90  do {
91  delete (*boxes)[i];
92  boxes->erase(boxes->begin() + i);
93  --i;
94  } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL);
95  }
96  continue;
97  }
98  if (prev_i >= 0) {
99  Box* prev_box = (*boxes)[prev_i]->box_;
100  int shift = box->x - prev_box->x;
101  if (vertical_rules) {
102  shift = box->y - prev_box->y;
103  } else if (rtl_rules) {
104  shift = -shift;
105  }
106  if (-shift > max_shift) {
107  // This is a newline.
108  int width = prev_box->w;
109  int height = prev_box->h;
110  int x = prev_box->x + width;
111  int y = prev_box->y;
112  if (vertical_rules) {
113  x = prev_box->x;
114  y = prev_box->y + height;
115  } else if (rtl_rules) {
116  x = prev_box->x - width;
117  if (x < 0) {
118  tprintf("prev x = %d, width=%d\n", prev_box->x, width);
119  x = 0;
120  }
121  }
122  if (prev_i == i - 1) {
123  // New character needed.
124  BoxChar* new_box = new BoxChar("\t", 1);
125  new_box->AddBox(x, y, width, height);
126  new_box->page_ = (*boxes)[i]->page_;
127  boxes->insert(boxes->begin() + i, new_box);
128  ++i;
129  } else {
130  (*boxes)[i - 1]->AddBox(x, y, width, height);
131  (*boxes)[i - 1]->ch_ = "\t";
132  }
133  max_shift = 0;
134  } else if (shift > max_shift) {
135  max_shift = shift;
136  }
137  }
138  prev_i = i;
139  }
140 }
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:41
#define tprintf(...)
Definition: tprintf.h:31
const Box * box() const
Definition: boxchar.h:48

◆ InsertSpaces()

void tesseract::BoxChar::InsertSpaces ( bool  rtl_rules,
bool  vertical_rules,
vector< BoxChar *> *  boxes 
)
static

Definition at line 144 of file boxchar.cpp.

145  {
146  // After InsertNewlines, any remaining null boxes are not newlines, and are
147  // singletons, so add a box to each remaining null box.
148  for (int i = 1; i + 1 < boxes->size(); ++i) {
149  Box* box = (*boxes)[i]->box_;
150  if (box == NULL) {
151  Box* prev = (*boxes)[i - 1]->box_;
152  Box* next = (*boxes)[i + 1]->box_;
153  ASSERT_HOST(prev != NULL && next != NULL);
154  int top = MIN(prev->y, next->y);
155  int bottom = MAX(prev->y + prev->h, next->y + next->h);
156  int left = prev->x + prev->w;
157  int right = next->x;
158  if (vertical_rules) {
159  top = prev->y + prev->h;
160  bottom = next->y;
161  left = MIN(prev->x, next->x);
162  right = MAX(prev->x + prev->w, next->x + next->w);
163  } else if (rtl_rules) {
164  // With RTL we have to account for BiDi.
165  // Right becomes the min left of all prior boxes back to the first
166  // space or newline.
167  right = prev->x;
168  left = next->x + next->w;
169  for (int j = i - 2;
170  j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
171  --j) {
172  prev = (*boxes)[j]->box_;
173  ASSERT_HOST(prev != NULL);
174  if (prev->x < right) {
175  right = prev->x;
176  }
177  }
178  // Left becomes the max right of all next boxes forward to the first
179  // space or newline.
180  for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL &&
181  (*boxes)[j]->ch_ != "\t";
182  ++j) {
183  next = (*boxes)[j]->box_;
184  if (next->x + next->w > left) {
185  left = next->x + next->w;
186  }
187  }
188  }
189  // Italic and stylized characters can produce negative spaces, which
190  // Leptonica doesn't like, so clip to a positive size.
191  if (right <= left) right = left + 1;
192  if (bottom <= top) bottom = top + 1;
193  (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
194  (*boxes)[i]->ch_ = " ";
195  }
196  }
197 }
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
const Box * box() const
Definition: boxchar.h:48
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ MostlyVertical()

bool tesseract::BoxChar::MostlyVertical ( const vector< BoxChar *> &  boxes)
static

Definition at line 244 of file boxchar.cpp.

244  {
245  inT64 total_dx = 0, total_dy = 0;
246  for (int i = 1; i < boxes.size(); ++i) {
247  if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL &&
248  boxes[i - 1]->page_ == boxes[i]->page_) {
249  int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
250  int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
251  if (abs(dx) > abs(dy) * kMinNewlineRatio ||
252  abs(dy) > abs(dx) * kMinNewlineRatio) {
253  total_dx += dx * dx;
254  total_dy += dy * dy;
255  }
256  }
257  }
258  return total_dy > total_dx;
259 }
long long int inT64
Definition: host.h:41
const int kMinNewlineRatio
Definition: boxchar.cpp:37

◆ mutable_box()

Box* tesseract::BoxChar::mutable_box ( )
inline

Definition at line 58 of file boxchar.h.

58 { return box_; }

◆ mutable_ch()

string* tesseract::BoxChar::mutable_ch ( )
inline

Definition at line 57 of file boxchar.h.

57 { return &ch_; }

◆ operator<()

bool tesseract::BoxChar::operator< ( const BoxChar &  other) const
inline

Definition at line 62 of file boxchar.h.

62  {
63  if (box_ == NULL) return true;
64  if (other.box_ == NULL) return false;
65  return box_->x < other.box_->x;
66  }

◆ page()

const int& tesseract::BoxChar::page ( ) const
inline

Definition at line 49 of file boxchar.h.

49 { return page_; }

◆ PrepareToWrite()

void tesseract::BoxChar::PrepareToWrite ( vector< BoxChar *> *  boxes)
static

Definition at line 66 of file boxchar.cpp.

66  {
67  bool rtl_rules = ContainsMostlyRTL(*boxes);
68  bool vertical_rules = MostlyVertical(*boxes);
69  InsertNewlines(rtl_rules, vertical_rules, boxes);
70  InsertSpaces(rtl_rules, vertical_rules, boxes);
71  for (int i = 0; i < boxes->size(); ++i) {
72  if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i);
73  }
74  if (rtl_rules) {
75  ReorderRTLText(boxes);
76  }
77 }
static void ReorderRTLText(vector< BoxChar *> *boxes)
Definition: boxchar.cpp:201
#define tprintf(...)
Definition: tprintf.h:31
static void InsertNewlines(bool rtl_rules, bool vertical_rules, vector< BoxChar *> *boxes)
Definition: boxchar.cpp:81
static bool ContainsMostlyRTL(const vector< BoxChar *> &boxes)
Definition: boxchar.cpp:215
static void InsertSpaces(bool rtl_rules, bool vertical_rules, vector< BoxChar *> *boxes)
Definition: boxchar.cpp:144
static bool MostlyVertical(const vector< BoxChar *> &boxes)
Definition: boxchar.cpp:244

◆ ReorderRTLText()

void tesseract::BoxChar::ReorderRTLText ( vector< BoxChar *> *  boxes)
static

Definition at line 201 of file boxchar.cpp.

201  {
202  // After adding newlines and spaces, this task is simply a matter of sorting
203  // by left each group of boxes between newlines.
204  BoxCharPtrSort sorter;
205  int end = 0;
206  for (int start = 0; start < boxes->size(); start = end + 1) {
207  end = start + 1;
208  while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
209  sort(boxes->begin() + start, boxes->begin() + end, sorter);
210  }
211 }

◆ RotateBoxes()

void tesseract::BoxChar::RotateBoxes ( float  rotation,
int  xcenter,
int  ycenter,
int  start_box,
int  end_box,
vector< BoxChar *> *  boxes 
)
static

Definition at line 272 of file boxchar.cpp.

274  {
275  Boxa* orig = boxaCreate(0);
276  for (int i = start_box; i < end_box; ++i) {
277  BOX* box = (*boxes)[i]->box_;
278  if (box) boxaAddBox(orig, box, L_CLONE);
279  }
280  Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
281  boxaDestroy(&orig);
282  for (int i = start_box, box_ind = 0; i < end_box; ++i) {
283  if ((*boxes)[i]->box_) {
284  boxDestroy(&((*boxes)[i]->box_));
285  (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
286  }
287  }
288  boxaDestroy(&rotated);
289 }
const Box * box() const
Definition: boxchar.h:48

◆ set_page()

void tesseract::BoxChar::set_page ( int  page)
inline

Definition at line 55 of file boxchar.h.

55 { page_ = page; }
const int & page() const
Definition: boxchar.h:49

◆ TotalByteLength()

int tesseract::BoxChar::TotalByteLength ( const vector< BoxChar *> &  boxes)
static

Definition at line 263 of file boxchar.cpp.

263  {
264  int total_length = 0;
265  for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size();
266  return total_length;
267 }

◆ TranslateBoxes()

void tesseract::BoxChar::TranslateBoxes ( int  xshift,
int  yshift,
vector< BoxChar *> *  boxes 
)
static

Definition at line 52 of file boxchar.cpp.

53  {
54  for (int i = 0; i < boxes->size(); ++i) {
55  BOX* box = (*boxes)[i]->box_;
56  if (box != NULL) {
57  box->x += xshift;
58  box->y += yshift;
59  }
60  }
61 }
const Box * box() const
Definition: boxchar.h:48

◆ WriteTesseractBoxFile()

void tesseract::BoxChar::WriteTesseractBoxFile ( const string &  name,
int  height,
const vector< BoxChar *> &  boxes 
)
static

Definition at line 293 of file boxchar.cpp.

294  {
295  string output = GetTesseractBoxStr(height, boxes);
296  File::WriteStringToFileOrDie(output, name);
297 }
static string GetTesseractBoxStr(int height, const vector< BoxChar *> &boxes)
Definition: boxchar.cpp:300
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:53

The documentation for this class was generated from the following files: