proxygen
String-inl.h
Go to the documentation of this file.
1 /*
2  * Copyright 2012-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <iterator>
20 #include <stdexcept>
21 
22 #include <folly/CppAttributes.h>
23 
24 #ifndef FOLLY_STRING_H_
25 #error This file may only be included from String.h
26 #endif
27 
28 namespace folly {
29 
30 namespace detail {
31 // Map from character code to value of one-character escape sequence
32 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33 // an octal escape sequence, or 'P' if the character is printable and
34 // should be printed as is.
35 extern const std::array<char, 256> cEscapeTable;
36 } // namespace detail
37 
38 template <class String>
39 void cEscape(StringPiece str, String& out) {
40  char esc[4];
41  esc[0] = '\\';
42  out.reserve(out.size() + str.size());
43  auto p = str.begin();
44  auto last = p; // last regular character
45  // We advance over runs of regular characters (printable, not double-quote or
46  // backslash) and copy them in one go; this is faster than calling push_back
47  // repeatedly.
48  while (p != str.end()) {
49  char c = *p;
50  unsigned char v = static_cast<unsigned char>(c);
51  char e = detail::cEscapeTable[v];
52  if (e == 'P') { // printable
53  ++p;
54  } else if (e == 'O') { // octal
55  out.append(&*last, size_t(p - last));
56  esc[1] = '0' + ((v >> 6) & 7);
57  esc[2] = '0' + ((v >> 3) & 7);
58  esc[3] = '0' + (v & 7);
59  out.append(esc, 4);
60  ++p;
61  last = p;
62  } else { // special 1-character escape
63  out.append(&*last, size_t(p - last));
64  esc[1] = e;
65  out.append(esc, 2);
66  ++p;
67  last = p;
68  }
69  }
70  out.append(&*last, size_t(p - last));
71 }
72 
73 namespace detail {
74 // Map from the character code of the character following a backslash to
75 // the unescaped character if a valid one-character escape sequence
76 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77 // octal escape sequence, 'X' if this is the first character of a
78 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79 extern const std::array<char, 256> cUnescapeTable;
80 
81 // Map from the character code to the hex value, or 16 if invalid hex char.
82 extern const std::array<unsigned char, 256> hexTable;
83 } // namespace detail
84 
85 template <class String>
86 void cUnescape(StringPiece str, String& out, bool strict) {
87  out.reserve(out.size() + str.size());
88  auto p = str.begin();
89  auto last = p; // last regular character (not part of an escape sequence)
90  // We advance over runs of regular characters (not backslash) and copy them
91  // in one go; this is faster than calling push_back repeatedly.
92  while (p != str.end()) {
93  char c = *p;
94  if (c != '\\') { // normal case
95  ++p;
96  continue;
97  }
98  out.append(&*last, p - last);
99  ++p;
100  if (p == str.end()) { // backslash at end of string
101  if (strict) {
102  throw std::invalid_argument("incomplete escape sequence");
103  }
104  out.push_back('\\');
105  last = p;
106  continue;
107  }
108  char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
109  if (e == 'O') { // octal
110  unsigned char val = 0;
111  for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
112  ++i, ++p) {
113  val <<= 3;
114  val |= (*p - '0');
115  }
116  out.push_back(val);
117  last = p;
118  } else if (e == 'X') { // hex
119  ++p;
120  if (p == str.end()) { // \x at end of string
121  if (strict) {
122  throw std::invalid_argument("incomplete hex escape sequence");
123  }
124  out.append("\\x");
125  last = p;
126  continue;
127  }
128  unsigned char val = 0;
129  unsigned char h;
130  for (; (p != str.end() &&
131  (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
132  ++p) {
133  val <<= 4;
134  val |= h;
135  }
136  out.push_back(val);
137  last = p;
138  } else if (e == 'I') { // invalid
139  if (strict) {
140  throw std::invalid_argument("invalid escape sequence");
141  }
142  out.push_back('\\');
143  out.push_back(*p);
144  ++p;
145  last = p;
146  } else { // standard escape sequence, \' etc
147  out.push_back(e);
148  ++p;
149  last = p;
150  }
151  }
152  out.append(&*last, p - last);
153 }
154 
155 namespace detail {
156 // Map from character code to escape mode:
157 // 0 = pass through
158 // 1 = unused
159 // 2 = pass through in PATH mode
160 // 3 = space, replace with '+' in QUERY mode
161 // 4 = percent-encode
162 extern const std::array<unsigned char, 256> uriEscapeTable;
163 } // namespace detail
164 
165 template <class String>
166 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
167  static const char hexValues[] = "0123456789abcdef";
168  char esc[3];
169  esc[0] = '%';
170  // Preallocate assuming that 25% of the input string will be escaped
171  out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
172  auto p = str.begin();
173  auto last = p; // last regular character
174  // We advance over runs of passthrough characters and copy them in one go;
175  // this is faster than calling push_back repeatedly.
176  unsigned char minEncode = static_cast<unsigned char>(mode);
177  while (p != str.end()) {
178  char c = *p;
179  unsigned char v = static_cast<unsigned char>(c);
180  unsigned char discriminator = detail::uriEscapeTable[v];
181  if (LIKELY(discriminator <= minEncode)) {
182  ++p;
183  } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
184  out.append(&*last, size_t(p - last));
185  out.push_back('+');
186  ++p;
187  last = p;
188  } else {
189  out.append(&*last, size_t(p - last));
190  esc[1] = hexValues[v >> 4];
191  esc[2] = hexValues[v & 0x0f];
192  out.append(esc, 3);
193  ++p;
194  last = p;
195  }
196  }
197  out.append(&*last, size_t(p - last));
198 }
199 
200 template <class String>
201 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
202  out.reserve(out.size() + str.size());
203  auto p = str.begin();
204  auto last = p;
205  // We advance over runs of passthrough characters and copy them in one go;
206  // this is faster than calling push_back repeatedly.
207  while (p != str.end()) {
208  char c = *p;
209  switch (c) {
210  case '%': {
211  if (UNLIKELY(std::distance(p, str.end()) < 3)) {
212  throw std::invalid_argument("incomplete percent encode sequence");
213  }
214  auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
215  auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
216  if (UNLIKELY(h1 == 16 || h2 == 16)) {
217  throw std::invalid_argument("invalid percent encode sequence");
218  }
219  out.append(&*last, size_t(p - last));
220  out.push_back((h1 << 4) | h2);
221  p += 3;
222  last = p;
223  break;
224  }
225  case '+':
226  if (mode == UriEscapeMode::QUERY) {
227  out.append(&*last, size_t(p - last));
228  out.push_back(' ');
229  ++p;
230  last = p;
231  break;
232  }
233  // else fallthrough
235  default:
236  ++p;
237  break;
238  }
239  }
240  out.append(&*last, size_t(p - last));
241 }
242 
243 namespace detail {
244 
245 /*
246  * The following functions are type-overloaded helpers for
247  * internalSplit().
248  */
249 inline size_t delimSize(char) {
250  return 1;
251 }
252 inline size_t delimSize(StringPiece s) {
253  return s.size();
254 }
255 inline bool atDelim(const char* s, char c) {
256  return *s == c;
257 }
258 inline bool atDelim(const char* s, StringPiece sp) {
259  return !std::memcmp(s, sp.start(), sp.size());
260 }
261 
262 // These are used to short-circuit internalSplit() in the case of
263 // 1-character strings.
264 inline char delimFront(char c) {
265  // This one exists only for compile-time; it should never be called.
266  std::abort();
267  return c;
268 }
269 inline char delimFront(StringPiece s) {
270  assert(!s.empty() && s.start() != nullptr);
271  return *s.start();
272 }
273 
274 /*
275  * Shared implementation for all the split() overloads.
276  *
277  * This uses some external helpers that are overloaded to let this
278  * algorithm be more performant if the deliminator is a single
279  * character instead of a whole string.
280  *
281  * @param ignoreEmpty iff true, don't copy empty segments to output
282  */
283 template <class OutStringT, class DelimT, class OutputIterator>
285  DelimT delim,
286  StringPiece sp,
287  OutputIterator out,
288  bool ignoreEmpty) {
289  assert(sp.empty() || sp.start() != nullptr);
290 
291  const char* s = sp.start();
292  const size_t strSize = sp.size();
293  const size_t dSize = delimSize(delim);
294 
295  if (dSize > strSize || dSize == 0) {
296  if (!ignoreEmpty || strSize > 0) {
297  *out++ = to<OutStringT>(sp);
298  }
299  return;
300  }
301  if (std::is_same<DelimT, StringPiece>::value && dSize == 1) {
302  // Call the char version because it is significantly faster.
303  return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
304  }
305 
306  size_t tokenStartPos = 0;
307  size_t tokenSize = 0;
308  for (size_t i = 0; i <= strSize - dSize; ++i) {
309  if (atDelim(&s[i], delim)) {
310  if (!ignoreEmpty || tokenSize > 0) {
311  *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
312  }
313 
314  tokenStartPos = i + dSize;
315  tokenSize = 0;
316  i += dSize - 1;
317  } else {
318  ++tokenSize;
319  }
320  }
321  tokenSize = strSize - tokenStartPos;
322  if (!ignoreEmpty || tokenSize > 0) {
323  *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
324  }
325 }
326 
327 template <class String>
328 StringPiece prepareDelim(const String& s) {
329  return StringPiece(s);
330 }
331 inline char prepareDelim(char c) {
332  return c;
333 }
334 
335 template <class OutputType>
337  output = folly::to<OutputType>(input);
338 }
339 
340 inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
341 
342 template <bool exact, class Delim, class OutputType>
343 bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
344  static_assert(
347  std::is_same<OutputType, decltype(std::ignore)>::value,
348  "split<false>() requires that the last argument be a string type "
349  "or std::ignore");
350  if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
351  return false;
352  }
353  toOrIgnore(input, output);
354  return true;
355 }
356 
357 template <bool exact, class Delim, class OutputType, class... OutputTypes>
359  const Delim& delimiter,
360  StringPiece input,
361  OutputType& outHead,
362  OutputTypes&... outTail) {
363  size_t cut = input.find(delimiter);
364  if (UNLIKELY(cut == std::string::npos)) {
365  return false;
366  }
367  StringPiece head(input.begin(), input.begin() + cut);
368  StringPiece tail(
369  input.begin() + cut + detail::delimSize(delimiter), input.end());
370  if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
371  toOrIgnore(head, outHead);
372  return true;
373  }
374  return false;
375 }
376 
377 } // namespace detail
378 
380 
381 template <class Delim, class String, class OutputType>
382 void split(
383  const Delim& delimiter,
384  const String& input,
385  std::vector<OutputType>& out,
386  bool ignoreEmpty) {
387  detail::internalSplit<OutputType>(
388  detail::prepareDelim(delimiter),
389  StringPiece(input),
390  std::back_inserter(out),
391  ignoreEmpty);
392 }
393 
394 template <class Delim, class String, class OutputType>
395 void split(
396  const Delim& delimiter,
397  const String& input,
399  bool ignoreEmpty) {
400  detail::internalSplit<OutputType>(
401  detail::prepareDelim(delimiter),
402  StringPiece(input),
403  std::back_inserter(out),
404  ignoreEmpty);
405 }
406 
407 template <
408  class OutputValueType,
409  class Delim,
410  class String,
411  class OutputIterator>
412 void splitTo(
413  const Delim& delimiter,
414  const String& input,
415  OutputIterator out,
416  bool ignoreEmpty) {
417  detail::internalSplit<OutputValueType>(
418  detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
419 }
420 
421 template <bool exact, class Delim, class... OutputTypes>
422 typename std::enable_if<
424  sizeof...(OutputTypes) >= 1,
425  bool>::type
426 split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
427  return detail::splitFixed<exact>(
428  detail::prepareDelim(delimiter), input, outputs...);
429 }
430 
431 namespace detail {
432 
433 /*
434  * If a type can have its string size determined cheaply, we can more
435  * efficiently append it in a loop (see internalJoinAppend). Note that the
436  * struct need not conform to the std::string api completely (ex. does not need
437  * to implement append()).
438  */
439 template <class T>
441  enum {
443  };
444 };
445 
446 template <class Iterator>
448  : IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
449 
450 template <class Delim, class Iterator, class String>
452  Delim delimiter,
453  Iterator begin,
454  Iterator end,
455  String& output) {
456  assert(begin != end);
457  if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) {
458  internalJoinAppend(delimFront(delimiter), begin, end, output);
459  return;
460  }
461  toAppend(*begin, &output);
462  while (++begin != end) {
463  toAppend(delimiter, *begin, &output);
464  }
465 }
466 
467 template <class Delim, class Iterator, class String>
469 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
470  output.clear();
471  if (begin == end) {
472  return;
473  }
474  const size_t dsize = delimSize(delimiter);
475  Iterator it = begin;
476  size_t size = it->size();
477  while (++it != end) {
478  size += dsize + it->size();
479  }
480  output.reserve(size);
481  internalJoinAppend(delimiter, begin, end, output);
482 }
483 
484 template <class Delim, class Iterator, class String>
485 typename std::enable_if<
487 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
488  output.clear();
489  if (begin == end) {
490  return;
491  }
492  internalJoinAppend(delimiter, begin, end, output);
493 }
494 
495 } // namespace detail
496 
497 template <class Delim, class Iterator, class String>
498 void join(
499  const Delim& delimiter,
500  Iterator begin,
501  Iterator end,
502  String& output) {
503  detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
504 }
505 
506 template <class OutputString>
508  folly::StringPiece input,
509  OutputString& output,
510  bool hex_style) {
511  static const char hexValues[] = "0123456789abcdef";
512  output.clear();
513  output.reserve(3 * input.size());
514  for (unsigned char c : input) {
515  // less than space or greater than '~' are considered unprintable
516  if (c < 0x20 || c > 0x7e || c == '\\') {
517  bool hex_append = false;
518  output.push_back('\\');
519  if (hex_style) {
520  hex_append = true;
521  } else {
522  if (c == '\r') {
523  output += 'r';
524  } else if (c == '\n') {
525  output += 'n';
526  } else if (c == '\t') {
527  output += 't';
528  } else if (c == '\a') {
529  output += 'a';
530  } else if (c == '\b') {
531  output += 'b';
532  } else if (c == '\0') {
533  output += '0';
534  } else if (c == '\\') {
535  output += '\\';
536  } else {
537  hex_append = true;
538  }
539  }
540  if (hex_append) {
541  output.push_back('x');
542  output.push_back(hexValues[(c >> 4) & 0xf]);
543  output.push_back(hexValues[c & 0xf]);
544  }
545  } else {
546  output += c;
547  }
548  }
549 }
550 
551 template <class String1, class String2>
552 void humanify(const String1& input, String2& output) {
553  size_t numUnprintable = 0;
554  size_t numPrintablePrefix = 0;
555  for (unsigned char c : input) {
556  if (c < 0x20 || c > 0x7e || c == '\\') {
557  ++numUnprintable;
558  }
559  if (numUnprintable == 0) {
560  ++numPrintablePrefix;
561  }
562  }
563 
564  // hexlify doubles a string's size; backslashify can potentially
565  // explode it by 4x. Now, the printable range of the ascii
566  // "spectrum" is around 95 out of 256 values, so a "random" binary
567  // string should be around 60% unprintable. We use a 50% hueristic
568  // here, so if a string is 60% unprintable, then we just use hex
569  // output. Otherwise we backslash.
570  //
571  // UTF8 is completely ignored; as a result, utf8 characters will
572  // likely be \x escaped (since most common glyphs fit in two bytes).
573  // This is a tradeoff of complexity/speed instead of a convenience
574  // that likely would rarely matter. Moreover, this function is more
575  // about displaying underlying bytes, not about displaying glyphs
576  // from languages.
577  if (numUnprintable == 0) {
578  output = input;
579  } else if (5 * numUnprintable >= 3 * input.size()) {
580  // However! If we have a "meaningful" prefix of printable
581  // characters, say 20% of the string, we backslashify under the
582  // assumption viewing the prefix as ascii is worth blowing the
583  // output size up a bit.
584  if (5 * numPrintablePrefix >= input.size()) {
585  backslashify(input, output);
586  } else {
587  output = "0x";
588  hexlify(input, output, true /* append output */);
589  }
590  } else {
591  backslashify(input, output);
592  }
593 }
594 
595 template <class InputString, class OutputString>
596 bool hexlify(
597  const InputString& input,
598  OutputString& output,
599  bool append_output) {
600  if (!append_output) {
601  output.clear();
602  }
603 
604  static char hexValues[] = "0123456789abcdef";
605  auto j = output.size();
606  output.resize(2 * input.size() + output.size());
607  for (size_t i = 0; i < input.size(); ++i) {
608  int ch = input[i];
609  output[j++] = hexValues[(ch >> 4) & 0xf];
610  output[j++] = hexValues[ch & 0xf];
611  }
612  return true;
613 }
614 
615 template <class InputString, class OutputString>
616 bool unhexlify(const InputString& input, OutputString& output) {
617  if (input.size() % 2 != 0) {
618  return false;
619  }
620  output.resize(input.size() / 2);
621  int j = 0;
622 
623  for (size_t i = 0; i < input.size(); i += 2) {
624  int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
625  int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
626  if ((highBits | lowBits) & 0x10) {
627  // One of the characters wasn't a hex digit
628  return false;
629  }
630  output[j++] = (highBits << 4) + lowBits;
631  }
632  return true;
633 }
634 
635 namespace detail {
640 size_t
641 hexDumpLine(const void* ptr, size_t offset, size_t size, std::string& line);
642 } // namespace detail
643 
644 template <class OutIt>
645 void hexDump(const void* ptr, size_t size, OutIt out) {
646  size_t offset = 0;
647  std::string line;
648  while (offset < size) {
649  offset += detail::hexDumpLine(ptr, offset, size, line);
650  *out++ = line;
651  }
652 }
653 
654 } // namespace folly
bool atDelim(const char *s, char c)
Definition: String-inl.h:255
StringPiece prepareDelim(const String &s)
Definition: String-inl.h:328
void * ptr
void splitTo(const Delim &delimiter, const String &input, OutputIterator out, bool ignoreEmpty)
Definition: String-inl.h:412
bool unhexlify(const InputString &input, OutputString &output)
Definition: String-inl.h:616
*than *hazptr_holder h
Definition: Hazptr.h:116
const std::array< unsigned char, 256 > uriEscapeTable
Definition: String.cpp:122
auto v
UriEscapeMode
Definition: String.h:121
const std::array< char, 256 > cEscapeTable
Definition: String.cpp:116
void internalJoinAppend(Delim delimiter, Iterator begin, Iterator end, String &output)
Definition: String-inl.h:451
void uriUnescape(StringPiece str, String &out, UriEscapeMode mode)
Definition: String-inl.h:201
PskType type
size_type find(const_range_type str) const
Definition: Range.h:721
#define LIKELY(x)
Definition: Likely.h:47
constexpr size_type size() const
Definition: Range.h:431
auto begin(TestAdlIterable &instance)
Definition: ForeachTest.cpp:56
double val
Definition: String.cpp:273
void toOrIgnore(StringPiece input, OutputType &output)
Definition: String-inl.h:336
void humanify(const String1 &input, String2 &output)
Definition: String-inl.h:552
—— Concurrent Priority Queue Implementation ——
Definition: AtomicBitSet.h:29
constexpr Iter start() const
Definition: Range.h:449
void split(const Delim &delimiter, const String &input, std::vector< OutputType > &out, bool ignoreEmpty)
Definition: String-inl.h:382
auto ch
folly::Optional< PskKeyExchangeMode > mode
const std::array< unsigned char, 256 > hexTable
Definition: String.cpp:120
constexpr auto size(C const &c) -> decltype(c.size())
Definition: Access.h:45
constexpr bool empty() const
Definition: Range.h:443
auto end(TestAdlIterable &instance)
Definition: ForeachTest.cpp:62
void uriEscape(StringPiece str, String &out, UriEscapeMode mode)
Definition: String-inl.h:166
size_t delimSize(char)
Definition: String-inl.h:249
size_t hexDumpLine(const void *ptr, size_t offset, size_t size, std::string &line)
Definition: String.cpp:651
Range subpiece(size_type first, size_type length=npos) const
Definition: Range.h:686
void cUnescape(StringPiece str, String &out, bool strict)
Definition: String-inl.h:86
void hexDump(const void *ptr, size_t size, OutIt out)
Definition: String-inl.h:645
const std::array< char, 256 > cUnescapeTable
Definition: String.cpp:118
static const char *const value
Definition: Conv.cpp:50
void toAppend(char value, Tgt *result)
Definition: Conv.h:406
constexpr Iter end() const
Definition: Range.h:455
std::enable_if< IsSizableStringContainerIterator< Iterator >::value >::type internalJoin(Delim delimiter, Iterator begin, Iterator end, String &output)
Definition: String-inl.h:469
constexpr Iter begin() const
Definition: Range.h:452
void backslashify(folly::StringPiece input, OutputString &output, bool hex_style)
Definition: String-inl.h:507
const char * string
Definition: Conv.cpp:212
void internalSplit(DelimT delim, StringPiece sp, OutputIterator out, bool ignoreEmpty)
Definition: String-inl.h:284
static set< string > s
uint64_t value(const typename LockFreeRingBuffer< T, Atom >::Cursor &rbcursor)
void join(const Delim &delimiter, Iterator begin, Iterator end, String &output)
Definition: String-inl.h:498
bool splitFixed(const Delim &delimiter, StringPiece input, OutputType &output)
Definition: String-inl.h:343
Range< const char * > StringPiece
bool hexlify(const InputString &input, OutputString &output, bool append_output)
Definition: String-inl.h:596
#define UNLIKELY(x)
Definition: Likely.h:48
char c
char delimFront(char c)
Definition: String-inl.h:264
#define FOLLY_FALLTHROUGH
Definition: CppAttributes.h:63
void cEscape(StringPiece str, String &out)
Definition: String-inl.h:39