tesseract  3.05.02
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <ctype.h>
27 #include <math.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 
37 #include "scanutils.h"
38 #include "tprintf.h"
39 
// Bit flags collected while parsing a single conversion specification.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: drop the value, do not assign it
  FL_INV = 1 << 1,    // Character set is inverted
  FL_WIDTH = 1 << 2,  // An explicit field width was specified
  FL_MINUS = 1 << 3,  // Negative number
};
46 
// Size "ranks" of the integer type a conversion stores into. The 'h' and 'l'
// length modifiers move the rank down or up by one step from RANK_INT.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = INT_MAX  // Special value used for pointers
};

// Bounds used to clamp the rank after any number of 'h'/'l' modifiers.
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j' (intmax_t), 'z' (size_t) and 't' (ptrdiff_t)
// length modifiers.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
62 
// Reason the scanner stopped processing the format string early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Input ended (EOF was hit)
  BAIL_ERR = 2    // Input did not match the requested conversion
};
68 
69 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long.
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
73 
// Consumes whitespace from s and returns the first non-whitespace character
// (or EOF). That character is pushed back so the next read sees it again.
static inline int SkipSpace(FILE *s) {
  int next;
  do {
    next = fgetc(s);
  } while (isspace(next));
  ungetc(next, s);  // Keep the terminating character available for reading
  return next;
}
81 
82 static inline void
83 SetBit(unsigned long *bitmap, unsigned int bit) {
84  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
85 }
86 
87 static inline int
88 TestBit(unsigned long *bitmap, unsigned int bit) {
89  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
90 }
91 
// Returns the numeric value of character ch interpreted as a digit in the
// given base, or -1 if ch is not a valid digit for that base.
// Decimal digits '8' and '9' are only accepted when base >= 10; letters are
// only accepted for base 16 and only in the ranges A-F / a-f, so the result
// is always smaller than the base for bases 8, 10 and 16.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
  } else if (base == 16) {
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}
103 
104 // IO (re-)implementations -----------------------------------------------------
105 uintmax_t streamtoumax(FILE* s, int base) {
106  int minus = 0;
107  uintmax_t v = 0;
108  int d, c = 0;
109 
110  for (c = fgetc(s);
111  isspace(static_cast<unsigned char>(c)) && (c != EOF);
112  c = fgetc(s)) {}
113 
114  // Single optional + or -
115  if (c == '-' || c == '+') {
116  minus = (c == '-');
117  c = fgetc(s);
118  }
119 
120  // Assign correct base
121  if (base == 0) {
122  if (c == '0') {
123  c = fgetc(s);
124  if (c == 'x' || c == 'X') {
125  base = 16;
126  c = fgetc(s);
127  } else {
128  base = 8;
129  }
130  }
131  } else if (base == 16) {
132  if (c == '0') {
133  c = fgetc(s);
134  if (c == 'x' || c == 'X') c = fgetc(s);
135  }
136  }
137 
138  // Actual number parsing
139  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
140  v = v*base + d;
141 
142  ungetc(c, s);
143  return minus ? -v : v;
144 }
145 
146 double streamtofloat(FILE* s) {
147  int minus = 0;
148  int v = 0;
149  int d, c = 0;
150  int k = 1;
151  int w = 0;
152 
153  for (c = fgetc(s);
154  isspace(static_cast<unsigned char>(c)) && (c != EOF);
155  c = fgetc(s));
156 
157  // Single optional + or -
158  if (c == '-' || c == '+') {
159  minus = (c == '-');
160  c = fgetc(s);
161  }
162 
163  // Actual number parsing
164  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s))
165  v = v*10 + d;
166  if (c == '.') {
167  for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
168  w = w*10 + d;
169  k *= 10;
170  }
171  }
172  double f = static_cast<double>(v)
173  + static_cast<double>(w) / static_cast<double>(k);
174  if (c == 'e' || c == 'E') {
175  c = fgetc(s);
176  int expsign = 1;
177  if (c == '-' || c == '+') {
178  expsign = (c == '-') ? -1 : 1;
179  c = fgetc(s);
180  }
181  int exponent = 0;
182  for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
183  exponent = exponent * 10 + d;
184  }
185  exponent *= expsign;
186  f *= pow(10.0, static_cast<double>(exponent));
187  }
188  ungetc(c, s);
189 
190  return minus ? -f : f;
191 }
192 
193 double strtofloat(const char* s) {
194  int minus = 0;
195  int v = 0;
196  int d;
197  int k = 1;
198  int w = 0;
199 
200  while(*s && isspace(static_cast<unsigned char>(*s))) s++;
201 
202  // Single optional + or -
203  if (*s == '-' || *s == '+') {
204  minus = (*s == '-');
205  s++;
206  }
207 
208  // Actual number parsing
209  for (; *s && (d = DigitValue(*s, 10)) >= 0; s++)
210  v = v*10 + d;
211  if (*s == '.') {
212  for (++s; *s && (d = DigitValue(*s, 10)) >= 0; s++) {
213  w = w*10 + d;
214  k *= 10;
215  }
216  }
217  if (*s == 'e' || *s == 'E')
218  tprintf("WARNING: Scientific Notation not supported!");
219 
220  double f = static_cast<double>(v)
221  + static_cast<double>(w) / static_cast<double>(k);
222 
223  return minus ? -f : f;
224 }
225 
// Scanning engine shared by the variadic front ends; defined further down.
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// fscanf-style entry point: packs the variadic arguments into a va_list and
// forwards them to tvfscanf, returning whatever tvfscanf returns.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
238 
239 #ifdef EMBEDDED
240 
241 int fscanf(FILE* stream, const char *format, ...) {
242  va_list ap;
243  int rv;
244 
245  va_start(ap, format);
246  rv = tvfscanf(stream, format, ap);
247  va_end(ap);
248 
249  return rv;
250 }
251 
252 int vfscanf(FILE* stream, const char *format, ...) {
253  va_list ap;
254  int rv;
255 
256  va_start(ap, format);
257  rv = tvfscanf(stream, format, ap);
258  va_end(ap);
259 
260  return rv;
261 }
262 #endif
263 
264 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
265  const char *p = format;
266  char ch;
267  int q = 0;
268  uintmax_t val = 0;
269  int rank = RANK_INT; // Default rank
270  unsigned int width = UINT_MAX;
271  int base;
272  int flags = 0;
273  enum {
274  ST_NORMAL, // Ground state
275  ST_FLAGS, // Special flags
276  ST_WIDTH, // Field width
277  ST_MODIFIERS, // Length or conversion modifiers
278  ST_MATCH_INIT, // Initial state of %[ sequence
279  ST_MATCH, // Main state of %[ sequence
280  ST_MATCH_RANGE, // After - in a %[ sequence
281  } state = ST_NORMAL;
282  char *sarg = NULL; // %s %c or %[ string argument
283  enum Bail bail = BAIL_NONE;
284  int sign;
285  int converted = 0; // Successful conversions
286  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
287  (CHAR_BIT * sizeof(long))];
288  int matchinv = 0; // Is match map inverted?
289  unsigned char range_start = 0;
290  off_t start_off = ftell(stream);
291 
292  // Skip leading spaces
293  SkipSpace(stream);
294 
295  while ((ch = *p++) && !bail) {
296  switch (state) {
297  case ST_NORMAL:
298  if (ch == '%') {
299  state = ST_FLAGS;
300  flags = 0; rank = RANK_INT; width = UINT_MAX;
301  } else if (isspace(static_cast<unsigned char>(ch))) {
302  SkipSpace(stream);
303  } else {
304  if (fgetc(stream) != ch)
305  bail = BAIL_ERR; // Match failure
306  }
307  break;
308 
309  case ST_FLAGS:
310  if (ch == '*') {
311  flags |= FL_SPLAT;
312  } else if ('0' <= ch && ch <= '9') {
313  width = (ch-'0');
314  state = ST_WIDTH;
315  flags |= FL_WIDTH;
316  } else {
317  state = ST_MODIFIERS;
318  p--; // Process this character again
319  }
320  break;
321 
322  case ST_WIDTH:
323  if (ch >= '0' && ch <= '9') {
324  width = width*10+(ch-'0');
325  } else {
326  state = ST_MODIFIERS;
327  p--; // Process this character again
328  }
329  break;
330 
331  case ST_MODIFIERS:
332  switch (ch) {
333  // Length modifiers - nonterminal sequences
334  case 'h':
335  rank--; // Shorter rank
336  break;
337  case 'l':
338  rank++; // Longer rank
339  break;
340  case 'j':
341  rank = kIntMaxRank;
342  break;
343  case 'z':
344  rank = kSizeTRank;
345  break;
346  case 't':
347  rank = kPtrDiffRank;
348  break;
349  case 'L':
350  case 'q':
351  rank = RANK_LONGLONG; // long double/long long
352  break;
353 
354  default:
355  // Output modifiers - terminal sequences
356  state = ST_NORMAL; // Next state will be normal
357  if (rank < kMinRank) // Canonicalize rank
358  rank = kMinRank;
359  else if (rank > kMaxRank)
360  rank = kMaxRank;
361 
362  switch (ch) {
363  case 'P': // Upper case pointer
364  case 'p': // Pointer
365  rank = RANK_PTR;
366  base = 0; sign = 0;
367  goto scan_int;
368 
369  case 'i': // Base-independent integer
370  base = 0; sign = 1;
371  goto scan_int;
372 
373  case 'd': // Decimal integer
374  base = 10; sign = 1;
375  goto scan_int;
376 
377  case 'o': // Octal integer
378  base = 8; sign = 0;
379  goto scan_int;
380 
381  case 'u': // Unsigned decimal integer
382  base = 10; sign = 0;
383  goto scan_int;
384 
385  case 'x': // Hexadecimal integer
386  case 'X':
387  base = 16; sign = 0;
388  goto scan_int;
389 
390  case 'n': // Number of characters consumed
391  val = ftell(stream) - start_off;
392  goto set_integer;
393 
394  scan_int:
395  q = SkipSpace(stream);
396  if ( q <= 0 ) {
397  bail = BAIL_EOF;
398  break;
399  }
400  val = streamtoumax(stream, base);
401  // fall through
402 
403  set_integer:
404  if (!(flags & FL_SPLAT)) {
405  converted++;
406  switch(rank) {
407  case RANK_CHAR:
408  *va_arg(ap, unsigned char *)
409  = static_cast<unsigned char>(val);
410  break;
411  case RANK_SHORT:
412  *va_arg(ap, unsigned short *)
413  = static_cast<unsigned short>(val);
414  break;
415  case RANK_INT:
416  *va_arg(ap, unsigned int *)
417  = static_cast<unsigned int>(val);
418  break;
419  case RANK_LONG:
420  *va_arg(ap, unsigned long *)
421  = static_cast<unsigned long>(val);
422  break;
423  case RANK_LONGLONG:
424  *va_arg(ap, unsigned long long *)
425  = static_cast<unsigned long long>(val);
426  break;
427  case RANK_PTR:
428  *va_arg(ap, void **)
429  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
430  break;
431  }
432  }
433  break;
434 
435  case 'f': // Preliminary float value parsing
436  case 'g':
437  case 'G':
438  case 'e':
439  case 'E':
440  q = SkipSpace(stream);
441  if (q <= 0) {
442  bail = BAIL_EOF;
443  break;
444  }
445 
446  {
447  double fval = streamtofloat(stream);
448  if (!(flags & FL_SPLAT)) {
449  if (rank == RANK_INT)
450  *va_arg(ap, float *) = static_cast<float>(fval);
451  else if (rank == RANK_LONG)
452  *va_arg(ap, double *) = static_cast<double>(fval);
453  converted++;
454  }
455  }
456  break;
457 
458  case 'c': // Character
459  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
460  sarg = va_arg(ap, char *);
461  while (width--) {
462  if ((q = fgetc(stream)) <= 0) {
463  bail = BAIL_EOF;
464  break;
465  }
466  if (!(flags & FL_SPLAT)) {
467  *sarg++ = q;
468  converted++;
469  }
470  }
471  break;
472 
473  case 's': // String
474  {
475  char *sp;
476  sp = sarg = va_arg(ap, char *);
477  while (width--) {
478  q = fgetc(stream);
479  if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
480  ungetc(q, stream);
481  break;
482  }
483  if (!(flags & FL_SPLAT)) *sp = q;
484  sp++;
485  }
486  if (sarg == sp) {
487  bail = BAIL_EOF;
488  } else if (!(flags & FL_SPLAT)) {
489  *sp = '\0'; // Terminate output
490  converted++;
491  } else {
492  }
493  }
494  break;
495 
496  case '[': // Character range
497  sarg = va_arg(ap, char *);
498  state = ST_MATCH_INIT;
499  matchinv = 0;
500  memset(matchmap, 0, sizeof matchmap);
501  break;
502 
503  case '%': // %% sequence
504  if (fgetc(stream) != '%' )
505  bail = BAIL_ERR;
506  break;
507 
508  default: // Anything else
509  bail = BAIL_ERR; // Unknown sequence
510  break;
511  }
512  }
513  break;
514 
515  case ST_MATCH_INIT: // Initial state for %[ match
516  if (ch == '^' && !(flags & FL_INV)) {
517  matchinv = 1;
518  } else {
519  SetBit(matchmap, static_cast<unsigned char>(ch));
520  state = ST_MATCH;
521  }
522  break;
523 
524  case ST_MATCH: // Main state for %[ match
525  if (ch == ']') {
526  goto match_run;
527  } else if (ch == '-') {
528  range_start = static_cast<unsigned char>(ch);
529  state = ST_MATCH_RANGE;
530  } else {
531  SetBit(matchmap, static_cast<unsigned char>(ch));
532  }
533  break;
534 
535  case ST_MATCH_RANGE: // %[ match after -
536  if (ch == ']') {
537  SetBit(matchmap, static_cast<unsigned char>('-'));
538  goto match_run;
539  } else {
540  int i;
541  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
542  SetBit(matchmap, i);
543  state = ST_MATCH;
544  }
545  break;
546 
547  match_run: // Match expression finished
548  char* oarg = sarg;
549  while (width) {
550  q = fgetc(stream);
551  unsigned char qc = static_cast<unsigned char>(q);
552  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
553  ungetc(q, stream);
554  break;
555  }
556  if (!(flags & FL_SPLAT)) *sarg = q;
557  sarg++;
558  }
559  if (oarg == sarg) {
560  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
561  } else if (!(flags & FL_SPLAT)) {
562  *sarg = '\0';
563  converted++;
564  }
565  break;
566  }
567  }
568 
569  if (bail == BAIL_EOF && !converted)
570  converted = -1; // Return EOF (-1)
571 
572  return converted;
573 }
574 
575 #ifdef EMBEDDED
// EMBEDDED-build creat: opens pathname write-only via open(), creating the
// file if needed and truncating it, with the given permission bits. Returns
// the result of open().
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_CREAT | O_TRUNC | O_WRONLY;
  return open(pathname, open_flags, mode);
}
579 
580 #endif // EMBEDDED
double strtofloat(const char *s)
Definition: scanutils.cpp:193
size_t LongBit()
Definition: scanutils.cpp:70
Bail
Definition: scanutils.cpp:63
Flags
Definition: scanutils.cpp:40
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:61
CMD_EVENTS mode
Definition: pgedit.cpp:116
uintmax_t streamtoumax(FILE *s, int base)
Definition: scanutils.cpp:105
enum Ranks kSizeTRank
Definition: scanutils.cpp:60
#define tprintf(...)
Definition: tprintf.h:31
enum Ranks kMinRank
Definition: scanutils.cpp:56
Ranks
Definition: scanutils.cpp:47
double streamtofloat(FILE *s)
Definition: scanutils.cpp:146
enum Ranks kIntMaxRank
Definition: scanutils.cpp:59
enum Ranks kMaxRank
Definition: scanutils.cpp:57