tesseract  3.05.02
strngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: strngs.c (Formerly strings.c)
3  * Description: STRING class functions.
4  * Author: Ray Smith
5  * Created: Fri Feb 15 09:13:30 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "strngs.h"
21 
22 #include <assert.h>
23 
24 #include "genericvector.h"
25 #include "helpers.h"
26 #include "serialis.h"
27 #include "tprintf.h"
28 
29 using tesseract::TFile;
30 
31 // Size of buffer needed to host the decimal representation of the maximum
32 // possible length of an int (in 64 bits), being -<20 digits>.
33 const int kMaxIntSize = 22;
34 // Size of buffer needed to host the decimal representation of the maximum
35 // possible length of a %.8g being -1.2345678e+999<nul> = 16.
36 const int kMaxDoubleSize = 16;
37 
38 /**********************************************************************
39  * STRING_HEADER provides metadata about the allocated buffer,
40  * including total capacity and how much used (strlen with '\0').
41  *
42  * The implementation hides this header at the start of the data
43  * buffer and appends the string on the end to keep sizeof(STRING)
44  * unchanged from earlier versions so serialization is not affected.
45  *
46  * The collection of MACROS provide different implementations depending
47  * on whether the string keeps track of its strlen or not so that this
48  * feature can be added in later when consumers don't modify the string
49  **********************************************************************/
50 
51 // Smallest string to allocate by default
52 const int kMinCapacity = 16;
53 
54 char* STRING::AllocData(int used, int capacity) {
55  data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
56 
57  // header is the metadata for this memory block
58  STRING_HEADER* header = GetHeader();
59  header->capacity_ = capacity;
60  header->used_ = used;
61  return GetCStr();
62 }
63 
64 void STRING::DiscardData() {
65  free_string((char *)data_);
66 }
67 
68 // This is a private method; ensure FixHeader is called (or used_ is well defined)
69 // beforehand
70 char* STRING::ensure_cstr(inT32 min_capacity) {
71  STRING_HEADER* orig_header = GetHeader();
72  if (min_capacity <= orig_header->capacity_)
73  return ((char *)this->data_) + sizeof(STRING_HEADER);
74 
75  // if we are going to grow bigger, than double our existing
76  // size, but if that still is not big enough then keep the
77  // requested capacity
78  if (min_capacity < 2 * orig_header->capacity_)
79  min_capacity = 2 * orig_header->capacity_;
80 
81  int alloc = sizeof(STRING_HEADER) + min_capacity;
82  STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
83 
84  memcpy(&new_header[1], GetCStr(), orig_header->used_);
85  new_header->capacity_ = min_capacity;
86  new_header->used_ = orig_header->used_;
87 
88  // free old memory, then rebind to new memory
89  DiscardData();
90  data_ = new_header;
91 
92  assert(InvariantOk());
93  return ((char *)data_) + sizeof(STRING_HEADER);
94 }
95 
96 // This is const, but is modifying a mutable field
97 // this way it can be used on const or non-const instances.
98 void STRING::FixHeader() const {
99  const STRING_HEADER* header = GetHeader();
100  if (header->used_ < 0)
101  header->used_ = strlen(GetCStr()) + 1;
102 }
103 
104 
106  // Empty STRINGs contain just the "\0".
107  memcpy(AllocData(1, kMinCapacity), "", 1);
108 }
109 
110 STRING::STRING(const STRING& str) {
111  str.FixHeader();
112  const STRING_HEADER* str_header = str.GetHeader();
113  int str_used = str_header->used_;
114  char *this_cstr = AllocData(str_used, str_used);
115  memcpy(this_cstr, str.GetCStr(), str_used);
116  assert(InvariantOk());
117 }
118 
119 STRING::STRING(const char* cstr) {
120  if (cstr == NULL) {
121  // Empty STRINGs contain just the "\0".
122  memcpy(AllocData(1, kMinCapacity), "", 1);
123  } else {
124  int len = strlen(cstr) + 1;
125  char* this_cstr = AllocData(len, len);
126  memcpy(this_cstr, cstr, len);
127  }
128  assert(InvariantOk());
129 }
130 
131 STRING::STRING(const char *data, int length) {
132  if (data == NULL) {
133  // Empty STRINGs contain just the "\0".
134  memcpy(AllocData(1, kMinCapacity), "", 1);
135  } else {
136  char* this_cstr = AllocData(length + 1, length + 1);
137  memcpy(this_cstr, data, length);
138  this_cstr[length] = '\0';
139  }
140 }
141 
143  DiscardData();
144 }
145 
146 // TODO(rays) Change all callers to use TFile and remove the old functions.
147 // Writes to the given file. Returns false in case of error.
148 bool STRING::Serialize(FILE* fp) const {
149  inT32 len = length();
150  if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
151  if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
152  return true;
153 }
154 // Writes to the given file. Returns false in case of error.
155 bool STRING::Serialize(TFile* fp) const {
156  inT32 len = length();
157  if (fp->FWrite(&len, sizeof(len), 1) != 1) return false;
158  if (fp->FWrite(GetCStr(), 1, len) != len) return false;
159  return true;
160 }
161 // Reads from the given file. Returns false in case of error.
162 // If swap is true, assumes a big/little-endian swap is needed.
163 bool STRING::DeSerialize(bool swap, FILE* fp) {
164  inT32 len;
165  if (fread(&len, sizeof(len), 1, fp) != 1) return false;
166  if (swap)
167  ReverseN(&len, sizeof(len));
168  truncate_at(len);
169  if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
170  return true;
171 }
172 // Reads from the given file. Returns false in case of error.
173 // If swap is true, assumes a big/little-endian swap is needed.
174 bool STRING::DeSerialize(bool swap, TFile* fp) {
175  inT32 len;
176  if (fp->FRead(&len, sizeof(len), 1) != 1) return false;
177  if (swap)
178  ReverseN(&len, sizeof(len));
179  truncate_at(len);
180  if (fp->FRead(GetCStr(), 1, len) != len) return false;
181  return true;
182 }
183 
184 // As DeSerialize, but only seeks past the data - hence a static method.
186  inT32 len;
187  if (fp->FRead(&len, sizeof(len), 1) != 1) return false;
188  if (swap) ReverseN(&len, sizeof(len));
189  return fp->FRead(NULL, 1, len) == len;
190 }
191 
192 BOOL8 STRING::contains(const char c) const {
193  return (c != '\0') && (strchr (GetCStr(), c) != NULL);
194 }
195 
197  FixHeader();
198  return GetHeader()->used_ - 1;
199 }
200 
201 const char* STRING::string() const {
202  const STRING_HEADER* header = GetHeader();
203  if (header->used_ == 0)
204  return NULL;
205 
206  // mark header length unreliable because tesseract might
207  // cast away the const and mutate the string directly.
208  header->used_ = -1;
209  return GetCStr();
210 }
211 
212 const char* STRING::c_str() const {
213  return string();
214 }
215 
216 /******
217  * The STRING_IS_PROTECTED interface adds additional support to migrate
218  * code that needs to modify the STRING in ways not otherwise supported
219  * without violating encapsulation.
220  *
221  * Also makes the [] operator return a const so it is immutable
222  */
223 #if STRING_IS_PROTECTED
224 const char& STRING::operator[](inT32 index) const {
225  return GetCStr()[index];
226 }
227 
228 void STRING::insert_range(inT32 index, const char* str, int len) {
229  // if index is outside current range, then also grow size of string
230  // to accmodate the requested range.
231  STRING_HEADER* this_header = GetHeader();
232  int used = this_header->used_;
233  if (index > used)
234  used = index;
235 
236  char* this_cstr = ensure_cstr(used + len + 1);
237  if (index < used) {
238  // move existing string from index to '\0' inclusive.
239  memmove(this_cstr + index + len,
240  this_cstr + index,
241  this_header->used_ - index);
242  } else if (len > 0) {
243  // We are going to overwrite previous null terminator, so write the new one.
244  this_cstr[this_header->used_ + len - 1] = '\0';
245 
246  // If the old header did not have the terminator,
247  // then we need to account for it now that we've added it.
248  // Otherwise it was already accounted for; we just moved it.
249  if (this_header->used_ == 0)
250  ++this_header->used_;
251  }
252 
253  // Write new string to index.
254  // The string is already terminated from the conditions above.
255  memcpy(this_cstr + index, str, len);
256  this_header->used_ += len;
257 
258  assert(InvariantOk());
259 }
260 
261 void STRING::erase_range(inT32 index, int len) {
262  char* this_cstr = GetCStr();
263  STRING_HEADER* this_header = GetHeader();
264 
265  memcpy(this_cstr+index, this_cstr+index+len,
266  this_header->used_ - index - len);
267  this_header->used_ -= len;
268  assert(InvariantOk());
269 }
270 
271 #else
273  ASSERT_HOST(index >= 0);
274  FixHeader();
275  char* this_cstr = ensure_cstr(index + 1);
276  this_cstr[index] = '\0';
277  GetHeader()->used_ = index + 1;
278  assert(InvariantOk());
279 }
280 
281 char& STRING::operator[](inT32 index) const {
282  // Code is casting away this const and mutating the string,
283  // so mark used_ as -1 to flag it unreliable.
284  GetHeader()->used_ = -1;
285  return ((char *)GetCStr())[index];
286 }
287 #endif
288 
289 void STRING::split(const char c, GenericVector<STRING> *splited) {
290  int start_index = 0;
291  int len = length();
292  for (int i = 0; i < len; i++) {
293  if ((*this)[i] == c) {
294  if (i != start_index) {
295  (*this)[i] = '\0';
296  splited->push_back(STRING(GetCStr() + start_index, i - start_index));
297  (*this)[i] = c;
298  }
299  start_index = i + 1;
300  }
301  }
302 
303  if (len != start_index) {
304  splited->push_back(STRING(GetCStr() + start_index, len - start_index));
305  }
306 }
307 
308 BOOL8 STRING::operator==(const STRING& str) const {
309  FixHeader();
310  str.FixHeader();
311  const STRING_HEADER* str_header = str.GetHeader();
312  const STRING_HEADER* this_header = GetHeader();
313  int this_used = this_header->used_;
314  int str_used = str_header->used_;
315 
316  return (this_used == str_used)
317  && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
318 }
319 
320 BOOL8 STRING::operator!=(const STRING& str) const {
321  FixHeader();
322  str.FixHeader();
323  const STRING_HEADER* str_header = str.GetHeader();
324  const STRING_HEADER* this_header = GetHeader();
325  int this_used = this_header->used_;
326  int str_used = str_header->used_;
327 
328  return (this_used != str_used)
329  || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
330 }
331 
332 BOOL8 STRING::operator!=(const char* cstr) const {
333  FixHeader();
334  const STRING_HEADER* this_header = GetHeader();
335 
336  if (cstr == NULL)
337  return this_header->used_ > 1; // either '\0' or NULL
338  else {
339  inT32 length = strlen(cstr) + 1;
340  return (this_header->used_ != length)
341  || (memcmp(GetCStr(), cstr, length) != 0);
342  }
343 }
344 
346  str.FixHeader();
347  const STRING_HEADER* str_header = str.GetHeader();
348  int str_used = str_header->used_;
349 
350  GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
351  char* this_cstr = ensure_cstr(str_used);
352  STRING_HEADER* this_header = GetHeader();
353 
354  memcpy(this_cstr, str.GetCStr(), str_used);
355  this_header->used_ = str_used;
356 
357  assert(InvariantOk());
358  return *this;
359 }
360 
362  FixHeader();
363  str.FixHeader();
364  const STRING_HEADER* str_header = str.GetHeader();
365  const char* str_cstr = str.GetCStr();
366  int str_used = str_header->used_;
367  int this_used = GetHeader()->used_;
368  char* this_cstr = ensure_cstr(this_used + str_used);
369 
370  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
371 
372  if (this_used > 1) {
373  memcpy(this_cstr + this_used - 1, str_cstr, str_used);
374  this_header->used_ += str_used - 1; // overwrite '\0'
375  } else {
376  memcpy(this_cstr, str_cstr, str_used);
377  this_header->used_ = str_used;
378  }
379 
380  assert(InvariantOk());
381  return *this;
382 }
383 
384 void STRING::add_str_int(const char* str, int number) {
385  if (str != NULL)
386  *this += str;
387  // Allow space for the maximum possible length of inT64.
388  char num_buffer[kMaxIntSize];
389  snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
390  num_buffer[kMaxIntSize - 1] = '\0';
391  *this += num_buffer;
392 }
393 // Appends the given string and double (as a %.8g) to this.
394 void STRING::add_str_double(const char* str, double number) {
395  if (str != NULL)
396  *this += str;
397  // Allow space for the maximum possible length of %8g.
398  char num_buffer[kMaxDoubleSize];
399  snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
400  num_buffer[kMaxDoubleSize - 1] = '\0';
401  *this += num_buffer;
402 }
403 
404 STRING & STRING::operator=(const char* cstr) {
405  STRING_HEADER* this_header = GetHeader();
406  if (cstr) {
407  int len = strlen(cstr) + 1;
408 
409  this_header->used_ = 0; // don't bother copying data if need to realloc
410  char* this_cstr = ensure_cstr(len);
411  this_header = GetHeader(); // for realloc
412  memcpy(this_cstr, cstr, len);
413  this_header->used_ = len;
414  } else {
415  // Reallocate to same state as default constructor.
416  DiscardData();
417  // Empty STRINGs contain just the "\0".
418  memcpy(AllocData(1, kMinCapacity), "", 1);
419  }
420 
421  assert(InvariantOk());
422  return *this;
423 }
424 
425 void STRING::assign(const char *cstr, int len) {
426  STRING_HEADER* this_header = GetHeader();
427  this_header->used_ = 0; // don't bother copying data if need to realloc
428  char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
429 
430  this_header = GetHeader(); // for realloc
431  memcpy(this_cstr, cstr, len);
432  this_cstr[len] = '\0';
433  this_header->used_ = len + 1;
434 
435  assert(InvariantOk());
436 }
437 
438 STRING STRING::operator+(const STRING& str) const {
439  STRING result(*this);
440  result += str;
441 
442  assert(InvariantOk());
443  return result;
444 }
445 
446 
447 STRING STRING::operator+(const char ch) const {
448  STRING result;
449  FixHeader();
450  const STRING_HEADER* this_header = GetHeader();
451  int this_used = this_header->used_;
452  char* result_cstr = result.ensure_cstr(this_used + 1);
453  STRING_HEADER* result_header = result.GetHeader();
454  int result_used = result_header->used_;
455 
456  // copies '\0' but we'll overwrite that
457  memcpy(result_cstr, GetCStr(), this_used);
458  result_cstr[result_used] = ch; // overwrite old '\0'
459  result_cstr[result_used + 1] = '\0'; // append on '\0'
460  ++result_header->used_;
461 
462  assert(InvariantOk());
463  return result;
464 }
465 
466 
467 STRING& STRING::operator+=(const char *str) {
468  if (!str || !*str) // empty string has no effect
469  return *this;
470 
471  FixHeader();
472  int len = strlen(str) + 1;
473  int this_used = GetHeader()->used_;
474  char* this_cstr = ensure_cstr(this_used + len);
475  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
476 
477  // if we had non-empty string then append overwriting old '\0'
478  // otherwise replace
479  if (this_used > 0) {
480  memcpy(this_cstr + this_used - 1, str, len);
481  this_header->used_ += len - 1;
482  } else {
483  memcpy(this_cstr, str, len);
484  this_header->used_ = len;
485  }
486 
487  assert(InvariantOk());
488  return *this;
489 }
490 
491 
492 STRING& STRING::operator+=(const char ch) {
493  if (ch == '\0')
494  return *this;
495 
496  FixHeader();
497  int this_used = GetHeader()->used_;
498  char* this_cstr = ensure_cstr(this_used + 1);
499  STRING_HEADER* this_header = GetHeader();
500 
501  if (this_used > 0)
502  --this_used; // undo old empty null if there was one
503 
504  this_cstr[this_used++] = ch; // append ch to end
505  this_cstr[this_used++] = '\0'; // append '\0' after ch
506  this_header->used_ = this_used;
507 
508  assert(InvariantOk());
509  return *this;
510 }
const int kMinCapacity
Definition: strngs.cpp:52
~STRING()
Definition: strngs.cpp:142
BOOL8 operator==(const STRING &string) const
Definition: strngs.cpp:308
const int kMaxDoubleSize
Definition: strngs.cpp:36
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:289
char * alloc_string(inT32 count)
Definition: memry.cpp:30
inT32 length() const
Definition: strngs.cpp:196
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:131
void free_string(char *string)
Definition: memry.cpp:35
STRING()
Definition: strngs.cpp:105
static bool SkipDeSerialize(bool swap, tesseract::TFile *fp)
Definition: strngs.cpp:185
const int kMaxIntSize
Definition: strngs.cpp:33
unsigned char BOOL8
Definition: host.h:46
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:163
void add_str_double(const char *str, double number)
Definition: strngs.cpp:394
int push_back(T object)
void add_str_int(const char *str, int number)
Definition: strngs.cpp:384
char & operator[](inT32 index) const
Definition: strngs.cpp:281
void truncate_at(inT32 index)
Definition: strngs.cpp:272
const char * string() const
Definition: strngs.cpp:201
STRING & operator=(const char *string)
Definition: strngs.cpp:404
BOOL8 operator!=(const STRING &string) const
Definition: strngs.cpp:320
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:91
void assign(const char *cstr, int len)
Definition: strngs.cpp:425
const char * c_str() const
Definition: strngs.cpp:212
STRING & operator+=(const char *string)
Definition: strngs.cpp:467
int inT32
Definition: host.h:35
Definition: strngs.h:44
BOOL8 contains(const char c) const
Definition: strngs.cpp:192
bool Serialize(FILE *fp) const
Definition: strngs.cpp:148
STRING operator+(const STRING &string) const
Definition: strngs.cpp:438
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
#define ASSERT_HOST(x)
Definition: errcode.h:84