proxygen
http_parser.c
Go to the documentation of this file.
1 /* Based on src/http/ngx_http_parse.c from NGINX copyright Igor Sysoev
2  *
3  * Additional changes are licensed under the same terms as NGINX and
4  * copyright Joyent, Inc. and other Node contributors. All rights reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
25 
26 #include <assert.h>
27 #include <stddef.h>
28 #include <limits.h>
29 #include <stdlib.h>
30 
31 #if __cplusplus
32 #include <limits>
33 
34 namespace proxygen {
35 
36 #ifndef INT64_MAX
37 # define INT64_MAX std::numeric_limits<int64_t>::max()
38 #endif
39 
40 #else
41 #define nullptr NULL
42 
43 #endif /* __cplusplus */
44 
/* Smaller of a and b. The selected argument is evaluated a second time,
 * so do not pass expressions with side effects. Left alone if some other
 * header has already defined MIN. */
45 #ifndef MIN
46 # define MIN(a,b) ((a) < (b) ? (a) : (b))
47 #endif
48 
49 
/* Record error code e in parser->http_errno. Expands to a statement that
 * relies on a variable named `parser` being in scope at the point of use.
 * When HTTP_PARSER_DEBUG is non-zero, the source line of the expansion is
 * additionally stored in parser->error_lineno. */
50 #if HTTP_PARSER_DEBUG
51 #define SET_ERRNO(e) \
52 do { \
53  parser->http_errno = (e); \
54  parser->error_lineno = __LINE__; \
55 } while (0)
56 #else
57 #define SET_ERRNO(e) \
58 do { \
59  parser->http_errno = (e); \
60 } while(0)
61 #endif
62 
/* Write the local `state` variable back to parser->state, then return r
 * from the enclosing function. Requires `parser` and `state` in scope. */
63 #define RETURN(r) \
64 do { \
65  parser->state = state; \
66  return (r); \
67 } while(0)
68 
69 /* Run the notify callback FOR, returning ER if it fails.
 *
 * Steps, in order:
 *  - write the local `state` back to parser->state;
 *  - assert that the parser's errno is HPE_OK on entry;
 *  - call settings->on_FOR(parser); a non-zero result sets HPE_CB_FOR;
 *  - if the parser's errno is now anything other than HPE_OK (callback
 *    failure, or per the inline note a pause), return ER from the
 *    enclosing function.
 * Requires `parser`, `settings` and `state` in scope at the point of use. */
70 #define _CALLBACK_NOTIFY(FOR, ER) \
71 do { \
72  parser->state = state; \
73  assert(HTTP_PARSER_ERRNO(parser) == HPE_OK); \
74  \
75  if (0 != settings->on_##FOR(parser)) { \
76  SET_ERRNO(HPE_CB_##FOR); \
77  } \
78  \
79  /* We either errored above or got paused; get out */ \
80  if (HTTP_PARSER_ERRNO(parser) != HPE_OK) { \
81  return (ER); \
82  } \
83 } while (0)
84 
85 /* Run the notify callback FOR and consume the current byte: on early exit
 * the returned count is p - data + 1, i.e. it includes the byte at p. */
86 #define CALLBACK_NOTIFY(FOR) _CALLBACK_NOTIFY(FOR, p - data + 1)
87 
88 /* Run the notify callback FOR and don't consume the current byte: on early
 * exit the returned count is p - data, i.e. it excludes the byte at p. */
89 #define CALLBACK_NOTIFY_NOADVANCE(FOR) _CALLBACK_NOTIFY(FOR, p - data)
90 
91 /* Run data callback FOR with LEN bytes, returning ER if it fails.
 *
 * The local `state` is written back to parser->state and the parser's
 * errno is asserted to be HPE_OK. The callback is only invoked when
 * FOR_mark is non-null; it receives (parser, FOR_mark, LEN). A non-zero
 * result sets HPE_CB_FOR. If the parser's errno is then not HPE_OK the
 * enclosing function returns ER and the mark is left set; otherwise the
 * mark is reset to nullptr. */
92 #define _CALLBACK_DATA(FOR, LEN, ER) \
93 do { \
94  parser->state = state; \
95  assert(HTTP_PARSER_ERRNO(parser) == HPE_OK); \
96  \
97  if (FOR##_mark) { \
98  if (0 != settings->on_##FOR(parser, FOR##_mark, (LEN))) { \
99  SET_ERRNO(HPE_CB_##FOR); \
100  } \
101  \
102  /* We either errored above or got paused; get out */ \
103  if (HTTP_PARSER_ERRNO(parser) != HPE_OK) { \
104  return (ER); \
105  } \
106  FOR##_mark = nullptr; \
107  } \
108 } while (0)
109 
110 /* Run the data callback FOR and consume the current byte. The data passed
 * spans from the mark up to, but not including, the byte at p; on early
 * exit the returned count is p - data + 1. */
111 #define CALLBACK_DATA(FOR) \
112  _CALLBACK_DATA(FOR, p - FOR##_mark, p - data + 1)
113 
114 /* Run the data callback FOR and don't consume the current byte. Same data
 * span as CALLBACK_DATA; on early exit the returned count is p - data. */
115 #define CALLBACK_DATA_NOADVANCE(FOR) \
116  _CALLBACK_DATA(FOR, p - FOR##_mark, p - data)
117 
118 /* We just saw a synthetic space.
 *
 * Invokes data callback on_FOR with the one-byte SPACE string (length 1)
 * rather than with a marked range of the input. Unlike _CALLBACK_DATA it
 * neither consults a mark nor asserts HPE_OK on entry. A non-zero callback
 * result sets HPE_CB_FOR and returns p - data immediately; a non-HPE_OK
 * errno after a zero result also returns p - data. In both cases the
 * current byte is not counted as consumed. */
119 #define CALLBACK_SPACE(FOR) \
120 do { \
121  parser->state = state; \
122  if (0 != settings->on_##FOR(parser, SPACE, 1)) { \
123  SET_ERRNO(HPE_CB_##FOR); \
124  return (p - data); \
125  } \
126  \
127  /* We either errored above or got paused; get out */ \
128  if (HTTP_PARSER_ERRNO(parser) != HPE_OK) { \
129  return (p - data); \
130  } \
131 } while (0)
132 
133 /* Set the mark FOR; non-destructive if mark is already set.
 * The mark variable is FOR_mark and it is pointed at the current byte p,
 * so an existing (non-null) mark keeps pointing at its earlier position. */
134 #define MARK(FOR) \
135 do { \
136  if (!FOR##_mark) { \
137  FOR##_mark = p; \
138  } \
139 } while (0)
140 
141 
/* Lowercase string constants used by the parser.
 * CONTENT_LENGTH, TRANSFER_ENCODING and UPGRADE are compared one character
 * at a time against the TOKEN()-lowercased header name in s_header_field.
 * SPACE is the one-byte buffer handed to callbacks by CALLBACK_SPACE.
 * NOTE(review): CHUNKED is presumably matched against the Transfer-Encoding
 * value; its use is not visible in this part of the file. */
142 #define CONTENT_LENGTH "content-length"
143 #define TRANSFER_ENCODING "transfer-encoding"
144 #define UPGRADE "upgrade"
145 #define CHUNKED "chunked"
146 #define SPACE " "
147 
148 
/* Request-method names. The s_req_method state looks the current candidate
 * up as method_strings[parser->method] and matches the input against it
 * character by character, so the array is indexed by the method value.
 * NOTE(review): the order presumably mirrors enum http_method declared in
 * the header; confirm before adding or reordering entries. */
149 static const char *method_strings[] =
150  { "DELETE"
151  , "GET"
152  , "HEAD"
153  , "POST"
154  , "PUT"
155  , "CONNECT"
156  , "OPTIONS"
157  , "TRACE"
158  , "COPY"
159  , "LOCK"
160  , "MKCOL"
161  , "MOVE"
162  , "PROPFIND"
163  , "PROPPATCH"
164  , "UNLOCK"
165  , "REPORT"
166  , "MKACTIVITY"
167  , "CHECKOUT"
168  , "MERGE"
169  , "M-SEARCH"
170  , "NOTIFY"
171  , "SUBSCRIBE"
172  , "UNSUBSCRIBE"
173  , "PATCH"
174  };
175 
176 
177 /* Tokens as defined by rfc 2616. Also lowercases them.
178  * token = 1*<any CHAR except CTLs or separators>
179  * separators = "(" | ")" | "<" | ">" | "@"
180  * | "," | ";" | ":" | "\" | <">
181  * | "/" | "[" | "]" | "?" | "="
182  * | "{" | "}" | SP | HT
 *
 * The table is indexed by unsigned byte value (see the TOKEN() macro).
 * An entry of 0 means the byte is not a token character; any other entry
 * is the byte itself, with 'A'-'Z' folded to 'a'-'z'. Only the first 128
 * entries are listed; the remaining entries of the 256-element array are
 * zero-initialized, so bytes 0x80-0xFF are never token characters.
183  */
184 static const char tokens[256] = {
185 /* 0 nul 1 soh 2 stx 3 etx 4 eot 5 enq 6 ack 7 bel */
186  0, 0, 0, 0, 0, 0, 0, 0,
187 /* 8 bs 9 ht 10 nl 11 vt 12 np 13 cr 14 so 15 si */
188  0, 0, 0, 0, 0, 0, 0, 0,
189 /* 16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */
190  0, 0, 0, 0, 0, 0, 0, 0,
191 /* 24 can 25 em 26 sub 27 esc 28 fs 29 gs 30 rs 31 us */
192  0, 0, 0, 0, 0, 0, 0, 0,
193 /* 32 sp 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' */
194  0, '!', 0, '#', '$', '%', '&', '\'',
195 /* 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / */
196  0, 0, '*', '+', 0, '-', '.', 0,
197 /* 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 */
198  '0', '1', '2', '3', '4', '5', '6', '7',
199 /* 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? */
200  '8', '9', 0, 0, 0, 0, 0, 0,
201 /* 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G */
202  0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
203 /* 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O */
204  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
205 /* 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W */
206  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
207 /* 88 X 89 Y 90 Z 91 [ 92 \ 93 ] 94 ^ 95 _ */
208  'x', 'y', 'z', 0, 0, 0, '^', '_',
209 /* 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g */
210  '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
211 /* 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o */
212  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
213 /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */
214  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
215 /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */
216  'x', 'y', 'z', 0, '|', 0, '~', 0 };
217 
218 
/* Hex-digit decode table, indexed by unsigned byte value.
 * '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15, and every other
 * byte maps to -1.
 *
 * All 256 entries are spelled out on purpose. With only the first 128
 * listed, bytes 0x80-0xFF would be zero-initialized and therefore be
 * indistinguishable from the digit '0' to a caller that rejects input by
 * comparing the looked-up value against -1.
 */
static const int8_t unhex[256] =
  {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x00-0x0F */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x10-0x1F */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x20-0x2F */
  , 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1   /* 0x30-0x3F: '0'-'9' */
  ,-1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x40-0x4F: 'A'-'F' */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x50-0x5F */
  ,-1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x60-0x6F: 'a'-'f' */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x70-0x7F */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x80-0x8F */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0x90-0x9F */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xA0-0xAF */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xB0-0xBF */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xC0-0xCF */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xD0-0xDF */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xE0-0xEF */
  ,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1   /* 0xF0-0xFF */
  };
229 
/* T(v) marks table entries that are only permitted in non-strict builds:
 * it yields 0 when HTTP_PARSER_STRICT is non-zero and v otherwise. It is
 * undefined again right after the table. */
230 #if HTTP_PARSER_STRICT
231 # define T(v) 0
232 #else
233 # define T(v) v
234 #endif
235 
/* URL character class table, indexed by unsigned byte value and consulted
 * through IS_URL_CHAR(). A non-zero entry means the byte may appear as an
 * ordinary URL character. Control characters, space, '#', '?' and DEL are
 * 0; HT (9) and form feed (12) are allowed only when T() passes them
 * through. Only the first 128 entries are listed, so entries for bytes
 * 0x80-0xFF are zero-initialized; the non-strict IS_URL_CHAR() accepts
 * those bytes separately via its (c) & 0x80 test. */
236 static const uint8_t normal_url_char[256] = {
237 /* 0 nul 1 soh 2 stx 3 etx 4 eot 5 enq 6 ack 7 bel */
238  0, 0, 0, 0, 0, 0, 0, 0,
239 /* 8 bs 9 ht 10 nl 11 vt 12 np 13 cr 14 so 15 si */
240  0, T(1), 0, 0, T(1), 0, 0, 0,
241 /* 16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */
242  0, 0, 0, 0, 0, 0, 0, 0,
243 /* 24 can 25 em 26 sub 27 esc 28 fs 29 gs 30 rs 31 us */
244  0, 0, 0, 0, 0, 0, 0, 0,
245 /* 32 sp 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' */
246  0, 1, 1, 0, 1, 1, 1, 1,
247 /* 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / */
248  1, 1, 1, 1, 1, 1, 1, 1,
249 /* 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 */
250  1, 1, 1, 1, 1, 1, 1, 1,
251 /* 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? */
252  1, 1, 1, 1, 1, 1, 1, 0,
253 /* 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G */
254  1, 1, 1, 1, 1, 1, 1, 1,
255 /* 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O */
256  1, 1, 1, 1, 1, 1, 1, 1,
257 /* 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W */
258  1, 1, 1, 1, 1, 1, 1, 1,
259 /* 88 X 89 Y 90 Z 91 [ 92 \ 93 ] 94 ^ 95 _ */
260  1, 1, 1, 1, 1, 1, 1, 1,
261 /* 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g */
262  1, 1, 1, 1, 1, 1, 1, 1,
263 /* 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o */
264  1, 1, 1, 1, 1, 1, 1, 1,
265 /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */
266  1, 1, 1, 1, 1, 1, 1, 1,
267 /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */
268  1, 1, 1, 1, 1, 1, 1, 0, };
269 
270 #undef T
271 
272 enum state
273  { s_dead = 1 /* important that this is > 0 */
277 
293 
324 
330 
332 
337 
340 
341  /* Important: 's_headers_done' must be the last 'header' state. All
342  * states beyond this must be 'body' states. It is used for overflow
343  * checking. See the PARSING_HEADER() macro.
344  */
345 
349 
352 
354  };
355 
356 
/* Non-zero while `state` is at or before s_headers_done, the last of the
 * 'header' states; used for overflow checking. The argument is
 * parenthesized so that an expression argument (for example a ternary)
 * is compared as a whole rather than being split by operator precedence.
 */
#define PARSING_HEADER(state) ((state) <= s_headers_done)
358 
359 
361  { h_general = 0
362 
365 
369 
373 
375 
377  };
378 
380  {
391 };
392 
393 
/* Macros for character classes; depends on strict-mode.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (for example a ternary) is classified as a whole. Several macros still
 * evaluate their argument more than once, so do not pass expressions with
 * side effects.
 */
#define CR                  '\r'
#define LF                  '\n'
#define QT                  '"'
#define BS                  '\\'
/* ASCII case fold: sets bit 0x20, mapping 'A'-'Z' onto 'a'-'z'. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
/* Lowercased token character for c, or 0 if c is not a token character. */
#define TOKEN(c)            (tokens[(unsigned char)(c)])
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
  (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \
  (c) == ')')
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
  (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
  (c) == '$' || (c) == ',')

/* Strict builds accept only bytes flagged in normal_url_char and do not
 * allow '_' in host names; non-strict builds additionally accept any byte
 * with the high bit set in URLs and '_' in host names.
 */
#if HTTP_PARSER_STRICT
#define IS_URL_CHAR(c)      (normal_url_char[(unsigned char) (c)])
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
#define IS_URL_CHAR(c)                                                         \
  (normal_url_char[(unsigned char) (c)] || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                                        \
  (IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif

/* Accepts CR, LF, HT (9) and every byte above 31 except DEL (127); bytes
 * 0x80-0xFF therefore pass. All other control characters are rejected.
 */
#define IS_HEADER_CHAR(ch)                                                     \
  ((ch) == CR || (ch) == LF || (ch) == 9 ||                                    \
   ((unsigned char)(ch) > 31 && (ch) != 127))
428 
/* Initial state for a message: s_pre_start_req when parser->type is
 * HTTP_REQUEST, s_pre_start_res for any other type. Requires `parser` in
 * scope. */
429 #define start_state (parser->type == HTTP_REQUEST ? s_pre_start_req : s_pre_start_res)
430 
/* STRICT_CHECK expands to nothing here, so every STRICT_CHECK(...) line in
 * the state machine is a no-op and the condition is never evaluated.
 * NOTE(review): no HTTP_PARSER_STRICT variant of this macro is visible in
 * this part of the file; confirm that disabling these checks is intended.
 * NEW_MESSAGE() yields the state in which the next message starts. */
431 #define STRICT_CHECK(cond)
432 #define NEW_MESSAGE() start_state
433 
434 /* Map errno values to strings for human-readable output */
435 #define HTTP_STRERROR_GEN(n, s) { "HPE_" #n, s },
436 static struct {
437  const char *name;
438  const char *description;
439 } http_strerror_tab[] = {
441 };
442 #undef HTTP_STRERROR_GEN
443 
444 /* Our URL parser.
445  *
446  * This is designed to be shared by http_parser_execute() for URL validation,
447  * hence it has a state transition + byte-for-byte interface. In addition, it
448  * is meant to be embedded in http_parser_parse_url(), which does the dirty
449  * work of turning state transitions into URL components for its API.
450  *
451  * This function should only be invoked with non-space characters. It is
452  * assumed that the caller cares about (and can detect) the transition between
453  * URL and non-URL states by looking for these.
454  */
455 static enum state
456 parse_url_char(enum state s, const char ch)
457 {
458  if (ch == ' ' || ch == '\r' || ch == '\n') {
459  return s_dead;
460  }
461 
462 #if HTTP_PARSER_STRICT
463  if (ch == '\t' || ch == '\f') {
464  return s_dead;
465  }
466 #endif
467 
468  switch (s) {
470  /* Proxied requests are followed by scheme of an absolute URI (alpha).
471  * All methods except CONNECT are followed by '/' or '*'.
472  */
473 
474  if (ch == '/' || ch == '*') {
475  return s_req_path;
476  }
477 
478  if (IS_ALPHA(ch)) {
479  return s_req_schema;
480  }
481 
482  break;
483 
484  case s_req_schema:
485  if (IS_ALPHA(ch)) {
486  return s;
487  }
488 
489  if (ch == ':') {
490  return s_req_schema_slash;
491  }
492 
493  break;
494 
495  case s_req_schema_slash:
496  if (ch == '/') {
498  }
499 
500  break;
501 
503  if (ch == '/') {
504  return s_req_server_start;
505  }
506 
507  break;
508 
510  if (ch == '@') {
511  return s_dead;
512  }
513 
514  /* FALLTHROUGH */
515  case s_req_server_start:
516  case s_req_server:
517  if (ch == '/') {
518  return s_req_path;
519  }
520 
521  if (ch == '?') {
523  }
524 
525  if (ch == '@') {
526  return s_req_server_with_at;
527  }
528 
529  if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
530  return s_req_server;
531  }
532 
533  break;
534 
535  case s_req_path:
536  if (IS_URL_CHAR(ch)) {
537  return s;
538  }
539 
540  switch (ch) {
541  case '?':
543 
544  case '#':
545  return s_req_fragment_start;
546  }
547 
548  break;
549 
551  case s_req_query_string:
552  if (IS_URL_CHAR(ch)) {
553  return s_req_query_string;
554  }
555 
556  switch (ch) {
557  case '?':
558  /* allow extra '?' in query string */
559  return s_req_query_string;
560 
561  case '#':
562  return s_req_fragment_start;
563  }
564 
565  break;
566 
568  if (IS_URL_CHAR(ch)) {
569  return s_req_fragment;
570  }
571 
572  switch (ch) {
573  case '?':
574  return s_req_fragment;
575 
576  case '#':
577  return s;
578  }
579 
580  break;
581 
582  case s_req_fragment:
583  if (IS_URL_CHAR(ch)) {
584  return s;
585  }
586 
587  switch (ch) {
588  case '?':
589  case '#':
590  return s;
591  }
592 
593  break;
594 
595  default:
596  break;
597  }
598 
599  /* We should never fall out of the switch above unless there's an error */
600  return s_dead;
601 }
605  const char *data,
606  size_t len)
607 {
608  char c, ch;
609  int8_t unhex_val;
610  const char *p = data;
611 
612  /* Optimization: within the parsing loop below, we refer to this
613  * local copy of the state rather than parser->state. The compiler
614  * can't be sure whether parser->state will change during a callback,
615  * so it generates a lot of memory loads and stores to keep a register
616  * copy of the state in sync with the memory copy. We know, however,
617  * that the callbacks aren't allowed to change the parser state, so
618  * the parsing loop works with this local variable and only copies
619  * the value back to parser->state before returning or invoking a
620  * callback.
621  */
622  unsigned char state = parser->state;
623  const unsigned int lenient = 0;
624 
625  /* We're in an error state. Don't bother doing anything. */
626  if (HTTP_PARSER_ERRNO(parser) != HPE_OK) {
627  RETURN(0);
628  }
629 
630  if (len == 0) {
631  switch (state) {
632  case s_body_identity_eof:
633  /* Use of CALLBACK_NOTIFY() here would erroneously return 1 byte read if
634  * we got paused.
635  */
636  CALLBACK_NOTIFY_NOADVANCE(message_complete);
637  RETURN(0);
638 
640  case s_pre_start_res:
641  case s_pre_start_req:
642  RETURN(0);
643 
644  default:
646  RETURN(1);
647  }
648  }
649 
650  /* technically we could combine all of these (except for url_mark) into one
651  variable, saving stack space, but it seems more clear to have them
652  separated. */
653  const char *header_field_mark = 0;
654  const char *header_value_mark = 0;
655  const char *url_mark = 0;
656  const char *reason_mark = 0;
657  const char *body_mark = 0;
658 
659  if (state == s_header_field)
660  header_field_mark = data;
661  if (state == s_header_value)
662  header_value_mark = data;
663  if (state == s_req_path ||
664  state == s_req_schema ||
665  state == s_req_schema_slash ||
666  state == s_req_schema_slash_slash ||
667  state == s_req_port ||
668  state == s_req_query_string_start ||
669  state == s_req_query_string ||
670  state == s_req_host_start ||
671  state == s_req_host ||
672  state == s_req_host_ipv6 ||
673  state == s_req_host_done ||
674  state == s_req_fragment_start ||
675  state == s_req_fragment)
676  url_mark = data;
677  if (state == s_res_status)
678  reason_mark = data;
679 
680  /* Used only for overflow checking. If the parser is in a parsing-headers
681  * state, then its value is equal to max(data, the beginning of the current
682  * message or chunk). If the parser is in a not-parsing-headers state, then
683  * its value is irrelevant.
684  */
685  const char* data_or_header_data_start = data;
686 
687  for (p = data; p != data + len; p++) {
688  ch = *p;
689 
690  reexecute_byte:
691  switch (state) {
692 
694  if (ch == CR || ch == LF)
695  break;
696  state = s_start_req_or_res;
697  CALLBACK_NOTIFY_NOADVANCE(message_begin);
698  goto reexecute_byte;
699 
700  case s_start_req_or_res:
701  {
702  parser->flags = 0;
703  parser->content_length = -1;
704 
705  if (ch == 'H') {
706  state = s_res_or_resp_H;
707  } else {
708  parser->type = HTTP_REQUEST;
709  state = s_start_req;
710  goto reexecute_byte;
711  }
712 
713  break;
714  }
715 
716  case s_res_or_resp_H:
717  if (ch == 'T') {
718  parser->type = HTTP_RESPONSE;
719  state = s_res_HT;
720  } else {
721  if (ch != 'E') {
723  goto error;
724  }
725 
726  parser->type = HTTP_REQUEST;
727  parser->method = HTTP_HEAD;
728  parser->index = 2;
729  state = s_req_method;
730  }
731  break;
732 
733  case s_pre_start_res:
734  if (ch == CR || ch == LF)
735  break;
736  state = s_start_res;
737  CALLBACK_NOTIFY_NOADVANCE(message_begin);
738  goto reexecute_byte;
739 
740  case s_start_res:
741  {
742  parser->flags = 0;
743  parser->content_length = -1;
744 
745  switch (ch) {
746  case 'H':
747  state = s_res_H;
748  break;
749 
750  default:
752  goto error;
753  }
754 
755  break;
756  }
757 
758  case s_res_H:
759  STRICT_CHECK(ch != 'T');
760  state = s_res_HT;
761  break;
762 
763  case s_res_HT:
764  STRICT_CHECK(ch != 'T');
765  state = s_res_HTT;
766  break;
767 
768  case s_res_HTT:
769  STRICT_CHECK(ch != 'P');
770  state = s_res_HTTP;
771  break;
772 
773  case s_res_HTTP:
774  STRICT_CHECK(ch != '/');
775  state = s_res_first_http_major;
776  break;
777 
779  if (ch < '0' || ch > '9') {
781  goto error;
782  }
783 
784  parser->http_major = ch - '0';
785  state = s_res_http_major;
786  break;
787 
788  /* major HTTP version or dot */
789  case s_res_http_major:
790  {
791  if (ch == '.') {
792  state = s_res_first_http_minor;
793  break;
794  }
795 
796  if (!IS_NUM(ch)) {
798  goto error;
799  }
800 
801  parser->http_major *= 10;
802  parser->http_major += ch - '0';
803 
804  if (parser->http_major > 999) {
806  goto error;
807  }
808 
809  break;
810  }
811 
812  /* first digit of minor HTTP version */
814  if (!IS_NUM(ch)) {
816  goto error;
817  }
818 
819  parser->http_minor = ch - '0';
820  state = s_res_http_minor;
821  break;
822 
823  /* minor HTTP version or end of request line */
824  case s_res_http_minor:
825  {
826  if (ch == ' ') {
827  state = s_res_first_status_code;
828  break;
829  }
830 
831  if (!IS_NUM(ch)) {
833  goto error;
834  }
835 
836  parser->http_minor *= 10;
837  parser->http_minor += ch - '0';
838 
839  if (parser->http_minor > 999) {
841  goto error;
842  }
843 
844  break;
845  }
846 
848  {
849  if (!IS_NUM(ch)) {
850  if (ch == ' ') {
851  break;
852  }
853 
855  goto error;
856  }
857  parser->status_code = ch - '0';
858  state = s_res_status_code;
859  break;
860  }
861 
862  case s_res_status_code:
863  {
864  if (!IS_NUM(ch)) {
865  switch (ch) {
866  case ' ':
867  state = s_res_status;
868  break;
869  case CR:
870  state = s_res_line_almost_done;
871  break;
872  case LF:
873  state = s_header_field_start;
874  break;
875  default:
877  goto error;
878  }
879  break;
880  }
881 
882  parser->status_code *= 10;
883  parser->status_code += ch - '0';
884 
885  if (parser->status_code > 999) {
887  goto error;
888  }
889 
890  break;
891  }
892 
893  case s_res_status:
894  /* the human readable status. e.g. "NOT FOUND" */
895  MARK(reason);
896  if (ch == CR) {
897  state = s_res_line_almost_done;
898  CALLBACK_DATA(reason);
899  break;
900  }
901 
902  if (ch == LF) {
903  state = s_header_field_start;
904  CALLBACK_DATA(reason);
905  break;
906  }
907  break;
908 
910  STRICT_CHECK(ch != LF);
911  state = s_header_field_start;
912  break;
913 
914  case s_pre_start_req:
915  if (ch == CR || ch == LF) {
916  break;
917  }
918  state = s_start_req;
919  CALLBACK_NOTIFY_NOADVANCE(message_begin);
920  goto reexecute_byte;
921 
922  case s_start_req:
923  {
924  parser->flags = 0;
925  parser->content_length = -1;
926 
927  if (!IS_ALPHA(ch)) {
929  goto error;
930  }
931 
932  parser->method = (enum http_method) 0;
933  parser->index = 1;
934  switch (ch) {
935  case 'C': parser->method = HTTP_CONNECT; /* or COPY, CHECKOUT */ break;
936  case 'D': parser->method = HTTP_DELETE; break;
937  case 'G': parser->method = HTTP_GET; break;
938  case 'H': parser->method = HTTP_HEAD; break;
939  case 'L': parser->method = HTTP_LOCK; break;
940  case 'M': parser->method = HTTP_MKCOL; /* or MOVE, MKACTIVITY, MERGE, M-SEARCH */ break;
941  case 'N': parser->method = HTTP_NOTIFY; break;
942  case 'O': parser->method = HTTP_OPTIONS; break;
943  case 'P': parser->method = HTTP_POST;
944  /* or PROPFIND or PROPPATCH or PUT or PATCH */
945  break;
946  case 'R': parser->method = HTTP_REPORT; break;
947  case 'S': parser->method = HTTP_SUBSCRIBE; break;
948  case 'T': parser->method = HTTP_TRACE; break;
949  case 'U': parser->method = HTTP_UNLOCK; /* or UNSUBSCRIBE */ break;
950  default:
952  goto error;
953  }
954  state = s_req_method;
955 
956  break;
957  }
958 
959  case s_req_method:
960  {
961  if (ch == '\0') {
963  goto error;
964  }
965 
966  const char *matcher = method_strings[parser->method];
967  if (ch == ' ' && matcher[parser->index] == '\0') {
968  state = s_req_spaces_before_url;
969  } else if (ch == matcher[parser->index]) {
970  ; /* nada */
971  } else if (parser->method == HTTP_CONNECT) {
972  if (parser->index == 1 && ch == 'H') {
973  parser->method = HTTP_CHECKOUT;
974  } else if (parser->index == 2 && ch == 'P') {
975  parser->method = HTTP_COPY;
976  } else {
977  goto error;
978  }
979  } else if (parser->method == HTTP_MKCOL) {
980  if (parser->index == 1 && ch == 'O') {
981  parser->method = HTTP_MOVE;
982  } else if (parser->index == 1 && ch == 'E') {
983  parser->method = HTTP_MERGE;
984  } else if (parser->index == 1 && ch == '-') {
985  parser->method = HTTP_MSEARCH;
986  } else if (parser->index == 2 && ch == 'A') {
987  parser->method = HTTP_MKACTIVITY;
988  } else {
989  goto error;
990  }
991  } else if (parser->index == 1 && parser->method == HTTP_POST) {
992  if (ch == 'R') {
993  parser->method = HTTP_PROPFIND; /* or HTTP_PROPPATCH */
994  } else if (ch == 'U') {
995  parser->method = HTTP_PUT;
996  } else if (ch == 'A') {
997  parser->method = HTTP_PATCH;
998  } else {
999  goto error;
1000  }
1001  } else if (parser->index == 2 && parser->method == HTTP_UNLOCK && ch == 'S') {
1002  parser->method = HTTP_UNSUBSCRIBE;
1003  } else if (parser->index == 4 && parser->method == HTTP_PROPFIND && ch == 'P') {
1004  parser->method = HTTP_PROPPATCH;
1005  } else {
1007  goto error;
1008  }
1009 
1010  ++parser->index;
1011  break;
1012  }
1013 
1015  {
1016  if (ch == ' ') break;
1017 
1018  // CONNECT requests must be followed by a <host>:<port>
1019  if (parser->method == HTTP_CONNECT) {
1020  MARK(url);
1021  state = s_req_host_start;
1022  goto reexecute_byte;
1023  }
1024 
1025  if (ch == '/' || ch == '*') {
1026  MARK(url);
1027  state = s_req_path;
1028  break;
1029  }
1030 
1031  /* Proxied requests are followed by scheme of an absolute URI (alpha).
1032  * All other methods are followed by '/' or '*' (handled above).
1033  */
1034  if (IS_ALPHA(ch)) {
1035  MARK(url);
1036  state = s_req_schema;
1037  break;
1038  }
1039 
1041  goto error;
1042  }
1043 
1044  case s_req_schema:
1045  {
1046  if (IS_ALPHA(ch)) break;
1047 
1048  if (ch == ':') {
1049  state = s_req_schema_slash;
1050  break;
1051  }
1052 
1054  goto error;
1055  }
1056 
1057  case s_req_schema_slash:
1058  STRICT_CHECK(ch != '/');
1059  state = s_req_schema_slash_slash;
1060  break;
1061 
1063  STRICT_CHECK(ch != '/');
1064  state = s_req_host_start;
1065  break;
1066 
1067  case s_req_host_start:
1068  if (ch == '[') {
1069  state = s_req_host_ipv6;
1070  break;
1071  } else if (IS_ALPHANUM(ch)) {
1072  state = s_req_host;
1073  break;
1074  }
1075 
1077  goto error;
1078 
1079  case s_req_host:
1080  if (IS_HOST_CHAR(ch)) break;
1081  state = s_req_host_done;
1082  goto reexecute_byte;
1083 
1084  case s_req_host_ipv6:
1085  if (IS_HEX(ch) || ch == ':') break;
1086  if (ch == ']') {
1087  state = s_req_host_done;
1088  break;
1089  }
1090 
1092  goto error;
1093 
1094  case s_req_host_done:
1095  switch (ch) {
1096  case ':':
1097  state = s_req_port;
1098  break;
1099  case '/':
1100  state = s_req_path;
1101  break;
1102  case ' ':
1103  /* The request line looks like:
1104  * "GET http://foo.bar.com HTTP/1.1"
1105  * That is, there is no path.
1106  */
1107  state = s_req_http_start;
1108  CALLBACK_DATA(url);
1109  break;
1110  case '?':
1111  state = s_req_query_string_start;
1112  break;
1113  default:
1115  goto error;
1116  }
1117 
1118  break;
1119 
1120  case s_req_port:
1121  {
1122  if (IS_NUM(ch)) break;
1123  switch (ch) {
1124  case '/':
1125  state = s_req_path;
1126  break;
1127  case ' ':
1128  /* The request line looks like:
1129  * "GET http://foo.bar.com:1234 HTTP/1.1"
1130  * That is, there is no path.
1131  */
1132  state = s_req_http_start;
1133  CALLBACK_DATA(url);
1134  break;
1135  case '?':
1136  state = s_req_query_string_start;
1137  break;
1138  default:
1140  goto error;
1141  }
1142  break;
1143  }
1144 
1145  case s_req_path:
1146  {
1147  if (IS_URL_CHAR(ch)) break;
1148 
1149  switch (ch) {
1150  case ' ':
1151  state = s_req_http_start;
1152  CALLBACK_DATA(url);
1153  break;
1154  case CR:
1155  parser->http_major = 0;
1156  parser->http_minor = 9;
1157  state = s_headers_almost_done;
1158  CALLBACK_DATA(url);
1159  break;
1160  case LF:
1161  parser->http_major = 0;
1162  parser->http_minor = 9;
1163  state = s_headers_almost_done;
1164  CALLBACK_DATA(url);
1165  goto reexecute_byte;
1166  break;
1167  case '?':
1168  state = s_req_query_string_start;
1169  break;
1170  case '#':
1171  state = s_req_fragment_start;
1172  break;
1173  default:
1175  goto error;
1176  }
1177  break;
1178  }
1179 
1181  {
1182  if (IS_URL_CHAR(ch)) {
1183  state = s_req_query_string;
1184  break;
1185  }
1186 
1187  switch (ch) {
1188  case '?':
1189  break; /* XXX ignore extra '?' ... is this right? */
1190  case ' ':
1191  state = s_req_http_start;
1192  CALLBACK_DATA(url);
1193  break;
1194  case CR:
1195  parser->http_major = 0;
1196  parser->http_minor = 9;
1197  state = s_headers_almost_done;
1198  CALLBACK_DATA(url);
1199  break;
1200  case LF:
1201  parser->http_major = 0;
1202  parser->http_minor = 9;
1203  state = s_headers_almost_done;
1204  CALLBACK_DATA(url);
1205  goto reexecute_byte;
1206  break;
1207  case '#':
1208  state = s_req_fragment_start;
1209  break;
1210  default:
1212  goto error;
1213  }
1214  break;
1215  }
1216 
1217  case s_req_query_string:
1218  {
1219  if (IS_URL_CHAR(ch)) break;
1220 
1221  switch (ch) {
1222  case '?':
1223  /* allow extra '?' in query string */
1224  break;
1225  case ' ':
1226  state = s_req_http_start;
1227  CALLBACK_DATA(url);
1228  break;
1229  case CR:
1230  parser->http_major = 0;
1231  parser->http_minor = 9;
1232  state = s_headers_almost_done;
1233  CALLBACK_DATA(url);
1234  break;
1235  case LF:
1236  parser->http_major = 0;
1237  parser->http_minor = 9;
1238  state = s_headers_almost_done;
1239  CALLBACK_DATA(url);
1240  goto reexecute_byte;
1241  break;
1242  case '#':
1243  state = s_req_fragment_start;
1244  break;
1245  default:
1247  goto error;
1248  }
1249  break;
1250  }
1251 
1252  case s_req_fragment_start:
1253  {
1254  if (IS_URL_CHAR(ch)) {
1255  state = s_req_fragment;
1256  break;
1257  }
1258 
1259  switch (ch) {
1260  case ' ':
1261  state = s_req_http_start;
1262  CALLBACK_DATA(url);
1263  break;
1264  case CR:
1265  parser->http_major = 0;
1266  parser->http_minor = 9;
1267  state = s_headers_almost_done;
1268  CALLBACK_DATA(url);
1269  break;
1270  case LF:
1271  parser->http_major = 0;
1272  parser->http_minor = 9;
1273  state = s_headers_almost_done;
1274  CALLBACK_DATA(url);
1275  goto reexecute_byte;
1276  break;
1277  case '?':
1278  state = s_req_fragment;
1279  break;
1280  case '#':
1281  break;
1282  default:
1284  goto error;
1285  }
1286  break;
1287  }
1288 
1289  case s_req_fragment:
1290  {
1291  if (IS_URL_CHAR(ch)) break;
1292 
1293  switch (ch) {
1294  case ' ':
1295  state = s_req_http_start;
1296  CALLBACK_DATA(url);
1297  break;
1298  case CR:
1299  parser->http_major = 0;
1300  parser->http_minor = 9;
1301  state = s_headers_almost_done;
1302  CALLBACK_DATA(url);
1303  break;
1304  case LF:
1305  parser->http_major = 0;
1306  parser->http_minor = 9;
1307  state = s_headers_almost_done;
1308  CALLBACK_DATA(url);
1309  goto reexecute_byte;
1310  break;
1311  case '?':
1312  case '#':
1313  break;
1314  default:
1316  goto error;
1317  }
1318  break;
1319  }
1320 
1321  case s_req_http_start:
1322  switch (ch) {
1323  case 'H':
1324  state = s_req_http_H;
1325  break;
1326  case ' ':
1327  break;
1328  default:
1330  goto error;
1331  }
1332  break;
1333 
1334  case s_req_http_H:
1335  STRICT_CHECK(ch != 'T');
1336  state = s_req_http_HT;
1337  break;
1338 
1339  case s_req_http_HT:
1340  STRICT_CHECK(ch != 'T');
1341  state = s_req_http_HTT;
1342  break;
1343 
1344  case s_req_http_HTT:
1345  STRICT_CHECK(ch != 'P');
1346  state = s_req_http_HTTP;
1347  break;
1348 
1349  case s_req_http_HTTP:
1350  STRICT_CHECK(ch != '/');
1351  state = s_req_first_http_major;
1352  break;
1353 
1354  /* first digit of major HTTP version */
1356  if (ch < '0' || ch > '9') {
1358  goto error;
1359  }
1360 
1361  parser->http_major = ch - '0';
1362  state = s_req_http_major;
1363  break;
1364 
1365  /* major HTTP version or dot */
1366  case s_req_http_major:
1367  {
1368  if (ch == '.') {
1369  state = s_req_first_http_minor;
1370  break;
1371  }
1372 
1373  if (!IS_NUM(ch)) {
1375  goto error;
1376  }
1377 
1378  parser->http_major *= 10;
1379  parser->http_major += ch - '0';
1380 
1381  if (parser->http_major > 999) {
1383  goto error;
1384  }
1385 
1386  break;
1387  }
1388 
1389  /* first digit of minor HTTP version */
1391  if (!IS_NUM(ch)) {
1393  goto error;
1394  }
1395 
1396  parser->http_minor = ch - '0';
1397  state = s_req_http_minor;
1398  break;
1399 
1400  /* minor HTTP version or end of request line */
1401  case s_req_http_minor:
1402  {
1403  if (ch == CR) {
1404  if (parser->http_major== 0 && parser->http_minor == 9) {
1405  state = s_headers_almost_done;
1406  } else {
1407  state = s_req_line_almost_done;
1408  }
1409  break;
1410  }
1411 
1412  if (ch == LF) {
1413  if (parser->http_major == 0 && parser->http_minor == 9) {
1414  state = s_headers_almost_done;
1415  goto reexecute_byte;
1416  } else {
1417  state = s_header_field_start;
1418  }
1419  break;
1420  }
1421 
1422  /* XXX allow spaces after digit? */
1423 
1424  if (!IS_NUM(ch)) {
1426  goto error;
1427  }
1428 
1429  parser->http_minor *= 10;
1430  parser->http_minor += ch - '0';
1431 
1432  if (parser->http_minor > 999) {
1434  goto error;
1435  }
1436 
1437  break;
1438  }
1439 
1440  /* end of request line */
1442  {
1443  if (ch != LF) {
1445  goto error;
1446  }
1447 
1448  state = s_header_field_start;
1449  break;
1450  }
1451 
1452  case s_header_field_start:
1453  {
1454  if (ch == CR) {
1455  state = s_headers_almost_done;
1456  break;
1457  }
1458 
1459  if (ch == LF) {
1460  /* they might be just sending \n instead of \r\n so this would be
1461  * the second \n to denote the end of headers*/
1462  state = s_headers_almost_done;
1463  goto reexecute_byte;
1464  }
1465 
1466  c = TOKEN(ch);
1467 
1468  if (!c) {
1470  goto error;
1471  }
1472 
1473  MARK(header_field);
1474 
1475  parser->index = 0;
1476  state = s_header_field;
1477 
1478  switch (c) {
1479  case 'c':
1481  break;
1482 
1483  case 't':
1485  break;
1486 
1487  case 'u':
1488  parser->header_state = h_matching_upgrade;
1489  break;
1490 
1491  default:
1492  parser->header_state = h_general;
1493  break;
1494  }
1495  break;
1496  }
1497 
1498  case s_header_field:
1499  {
1500  c = TOKEN(ch);
1501 
1502  if (c) {
1503  switch (parser->header_state) {
1504  case h_general:
1505 
1506  // fast-forwarding, wheeeeeee!
1507  #define MOVE_THE_HEAD do { \
1508  ++p; \
1509  if (!TOKEN(*p)) { \
1510  ch = *p; \
1511  goto notatoken; \
1512  } \
1513  } while(0);
1514 
1515  if (data + len - p >= 9) {
1524  } else if (data + len - p >= 4) {
1528  }
1529 
1530  break;
1531 
1532  /* content-length */
1533 
1535  parser->index++;
1536  if (parser->index > sizeof(CONTENT_LENGTH)-1
1537  || c != CONTENT_LENGTH[parser->index]) {
1538  parser->header_state = h_general;
1539  } else if (parser->index == sizeof(CONTENT_LENGTH)-2) {
1540  parser->header_state = h_content_length;
1541  }
1542  break;
1543 
1544  /* transfer-encoding */
1545 
1547  parser->index++;
1548  if (parser->index > sizeof(TRANSFER_ENCODING)-1
1549  || c != TRANSFER_ENCODING[parser->index]) {
1550  parser->header_state = h_general;
1551  } else if (parser->index == sizeof(TRANSFER_ENCODING)-2) {
1553  }
1554  break;
1555 
1556  /* upgrade */
1557 
1558  case h_matching_upgrade:
1559  parser->index++;
1560  if (parser->index > sizeof(UPGRADE)-1
1561  || c != UPGRADE[parser->index]) {
1562  parser->header_state = h_general;
1563  } else if (parser->index == sizeof(UPGRADE)-2) {
1564  parser->header_state = h_upgrade;
1565  }
1566  break;
1567 
1568  case h_content_length:
1569  case h_transfer_encoding:
1570  case h_upgrade:
1571  if (ch != ' ') parser->header_state = h_general;
1572  break;
1573 
1574  default:
1575  assert(0 && "Unknown header_state");
1576  break;
1577  }
1578  break;
1579  }
1580 
1581  notatoken:
1582  if (ch == ':') {
1583  state = s_header_value_start;
1584  // do not allow headers with trailing whitespaces
1585  // https://tools.ietf.org/html/rfc7230#section-3.2.4
1586  if (p - header_field_mark > 1 &&
1587  data[p - data - 1] == ' ') {
1589  goto error;
1590  }
1591  CALLBACK_DATA(header_field);
1592  break;
1593  }
1594 
1596  goto error;
1597  }
1598 
1599  case s_header_value_start:
1600  {
1601  if (ch == ' ' || ch == '\t') break;
1602 
1603  MARK(header_value);
1604 
1605  state = s_header_value;
1606  parser->index = 0;
1607 
1608  // Error out if a content_length, transfer_encoding, or upgrade header
1609  // was present with no actual value. These headers correspond with
1610  // special parser states that without the below accept empty header
1611  // values and so we can reject such requests here in the parser.
1612  // If more headers are added, can consider moving to a hash/map based
1613  // model below.
1614  if (ch == CR || ch == LF) {
1615  if (parser->header_state == h_content_length) {
1617  } else if (parser->header_state == h_transfer_encoding) {
1619  } else if (parser->header_state == h_upgrade) {
1621  }
1622 
1623  if (parser->http_errno != HPE_OK) {
1624  goto error;
1625  }
1626  }
1627 
1628  if (ch == CR) {
1629  STRICT_CHECK(parser->quote != 0);
1630  parser->header_state = h_general;
1631  state = s_header_almost_done;
1632  CALLBACK_DATA(header_value);
1633  break;
1634  }
1635 
1636  if (ch == LF) {
1637  STRICT_CHECK(parser->quote != 0);
1638  state = s_header_field_start;
1639  CALLBACK_DATA(header_value);
1640  break;
1641  }
1642 
1643  c = LOWER(ch);
1644 
1645  switch (parser->header_state) {
1646  case h_upgrade:
1647  parser->flags |= F_UPGRADE;
1648  parser->header_state = h_general;
1649  break;
1650 
1651  case h_transfer_encoding:
1652  /* looking for 'Transfer-Encoding: chunked' */
1653  if ('c' == c) {
1655  } else {
1656  parser->header_state = h_general;
1657  }
1658  break;
1659 
1660  case h_content_length:
1661  if (!IS_NUM(ch)) {
1663  goto error;
1664  }
1665 
1666  parser->content_length = ch - '0';
1667  break;
1668 
1669  default:
1670  parser->header_state = ch == QT ? h_general_and_quote : h_general;
1671  break;
1672  }
1673  break;
1674  }
1675 
1676  case s_header_value:
1677  {
1678  cr_or_lf_or_qt:
1679  if (ch == CR &&
1681  state = s_header_almost_done;
1682  CALLBACK_DATA(header_value);
1683  break;
1684  }
1685 
1686  if (ch == LF &&
1688  state = s_header_almost_done;
1689  CALLBACK_DATA_NOADVANCE(header_value);
1690  goto reexecute_byte;
1691  }
1692 
1693  if (!lenient && !IS_HEADER_CHAR(ch) &&
1696  goto error;
1697  }
1698 
1699  switch (parser->header_state) {
1700  case h_general:
1701  if (ch == QT) {
1703  }
1704 
1705  // fast-forwarding, wheee!
1706  #define MOVE_FAST do { \
1707  ++p; \
1708  ch = *p; \
1709  if (ch == CR || ch == LF || ch == QT || \
1710  ch == BS || !IS_HEADER_CHAR(ch)) { \
1711  goto cr_or_lf_or_qt; \
1712  } \
1713  } while(0);
1714 
1715  if (data + len - p >= 12) {
1716  MOVE_FAST
1717  MOVE_FAST
1718  MOVE_FAST
1719  MOVE_FAST
1720  MOVE_FAST
1721  MOVE_FAST
1722  MOVE_FAST
1723  MOVE_FAST
1724  MOVE_FAST
1725  MOVE_FAST
1726  MOVE_FAST
1727  } else if (data + len - p >= 5) {
1728  MOVE_FAST
1729  MOVE_FAST
1730  MOVE_FAST
1731  MOVE_FAST
1732  }
1733 
1734  break;
1735 
1736  case h_general_and_quote:
1737  if (ch == QT) {
1738  parser->header_state = h_general;
1739  } else if (ch == BS) {
1741  }
1742  break;
1743 
1746  break;
1747 
1748  // Not sure the below is relevant anymore as from
1749  // s_header_value_start it appears as though we can never
1750  // be in the situation below
1751  case h_transfer_encoding:
1753  goto error;
1754  break;
1755 
1756  case h_content_length:
1757  if (ch == ' ') break;
1758  if (!IS_NUM(ch)) {
1760  goto error;
1761  }
1762 
1763  if (parser->content_length > ((INT64_MAX - 10) / 10)) {
1764  /* overflow */
1766  goto error;
1767  }
1768 
1769  parser->content_length *= 10;
1770  parser->content_length += ch - '0';
1771  break;
1772 
1773  /* Transfer-Encoding: chunked */
1775  parser->index++;
1776  if (parser->index > sizeof(CHUNKED)-1
1777  || LOWER(ch) != CHUNKED[parser->index]) {
1778  parser->header_state = h_general;
1779  } else if (parser->index == sizeof(CHUNKED)-2) {
1781  }
1782  break;
1783 
1785  if (ch != ' ') {
1786  parser->header_state = h_general;
1787  }
1788  break;
1789 
1790  default:
1791  state = s_header_value;
1792  parser->header_state = h_general;
1793  break;
1794  }
1795  break;
1796  }
1797 
1798  case s_header_almost_done:
1799  {
1800  if (ch == LF) {
1801  state = s_header_value_lws;
1802  } else {
1803  state = s_header_value;
1804  }
1805 
1806  switch (parser->header_state) {
1808  parser->flags |= F_CHUNKED;
1809  break;
1810  default:
1811  break;
1812  }
1813 
1814  if (ch != LF) {
1815  CALLBACK_SPACE(header_value);
1816  }
1817 
1818  break;
1819  }
1820 
1821  case s_header_value_lws:
1822  {
1823  if (ch == ' ' || ch == '\t')
1824  {
1825  state = s_header_value_start;
1826  CALLBACK_SPACE(header_value);
1827  }
1828  else
1829  {
1830  state = s_header_field_start;
1831  goto reexecute_byte;
1832  }
1833  break;
1834  }
1835 
1836  case s_headers_almost_done:
1837  {
1838  STRICT_CHECK(ch != LF);
1839 
1840  if (ch != LF) {
1842  goto error;
1843  }
1844 
1845  if (parser->flags & F_TRAILING) {
1846  /* End of a chunked request */
1847  state = s_message_done;
1848  CALLBACK_NOTIFY_NOADVANCE(chunk_complete);
1849  goto reexecute_byte;
1850  }
1851 
1852  state = s_headers_done;
1853 
1854  /* Set this here so that on_headers_complete() callbacks can see it */
1855  parser->upgrade =
1856  (parser->flags & F_UPGRADE || parser->method == HTTP_CONNECT);
1857 
1858  /* Here we call the headers_complete callback. This is somewhat
1859  * different than other callbacks because if the user returns 1, we
1860  * will interpret that as saying that this message has no body. This
1861  * is needed for the annoying case of receiving a response to a HEAD
1862  * request.
1863  *
1864  * We'd like to use CALLBACK_NOTIFY_NOADVANCE() here but we cannot, so
1865  * we have to simulate it by handling a change in errno below.
1866  */
1867  size_t header_size = p - data + 1;
1868  switch (settings->on_headers_complete(parser, nullptr, header_size)) {
1869  case 0:
1870  break;
1871 
1872  case 1:
1873  parser->flags |= F_SKIPBODY;
1874  break;
1875 
1876  default:
1878  RETURN(p - data); /* Error */
1879  }
1880 
1881  if (HTTP_PARSER_ERRNO(parser) != HPE_OK) {
1882  RETURN(p - data);
1883  }
1884 
1885  goto reexecute_byte;
1886  }
1887 
1888  case s_headers_done:
1889  {
1890  STRICT_CHECK(ch != LF);
1891 
1892  // we're done parsing headers, reset overflow counters
1893  parser->nread = 0;
1894  // (if we now move to s_body_*, then this is irrelevant)
1895  data_or_header_data_start = p;
1896 
1897  int hasBody = parser->flags & F_CHUNKED || parser->content_length > 0;
1898  if (parser->upgrade && (parser->method == HTTP_CONNECT ||
1899  (parser->flags & F_SKIPBODY) || !hasBody)) {
1900  /* Exit, the rest of the message is in a different protocol. */
1901  state = NEW_MESSAGE();
1902  CALLBACK_NOTIFY(message_complete);
1903  RETURN((p - data) + 1);
1904  }
1905 
1906  if (parser->flags & F_SKIPBODY) {
1907  state = NEW_MESSAGE();
1908  CALLBACK_NOTIFY(message_complete);
1909  } else if (parser->flags & F_CHUNKED) {
1910  /* chunked encoding - ignore Content-Length header */
1911  state = s_chunk_size_start;
1912  } else {
1913  if (parser->content_length == 0) {
1914  /* Content-Length header given but zero: Content-Length: 0\r\n */
1915  state = NEW_MESSAGE();
1916  CALLBACK_NOTIFY(message_complete);
1917  } else if (parser->content_length > 0) {
1918  /* Content-Length header given and non-zero */
1919  state = s_body_identity;
1920  } else {
1921  unsigned short sc = parser->status_code;
1922  if (parser->type == HTTP_REQUEST ||
1923  ((100 <= sc && sc <= 199) || sc == 204 || sc == 304)) {
1924  /* Assume content-length 0 - read the next */
1925  state = NEW_MESSAGE();
1926  CALLBACK_NOTIFY(message_complete);
1927  } else {
1928  /* Read body until EOF */
1929  state = s_body_identity_eof;
1930  }
1931  }
1932  }
1933 
1934  break;
1935  }
1936 
1937  case s_body_identity:
1938  {
1939  uint64_t to_read = MIN(parser->content_length, (data + len) - p);
1940 
1941  assert(parser->content_length > 0);
1942 
1943  /* The difference between advancing content_length and p is because
1944  * the latter will automatically advance on the next loop iteration.
1945  * Further, if content_length ends up at 0, we want to see the last
1946  * byte again for our message complete callback.
1947  */
1948  MARK(body);
1949  parser->content_length -= to_read;
1950  p += to_read - 1;
1951 
1952  if (parser->content_length == 0) {
1953  state = s_message_done;
1954 
1955  /* Mimic CALLBACK_DATA_NOADVANCE() but with one extra byte.
1956  *
1957  * The alternative to doing this is to wait for the next byte to
1958  * trigger the data callback, just as in every other case. The
1959  * problem with this is that this makes it difficult for the test
1960  * harness to distinguish between complete-on-EOF and
1961  * complete-on-length. It's not clear that this distinction is
1962  * important for applications, but let's keep it for now.
1963  */
1964  _CALLBACK_DATA(body, p - body_mark + 1, p - data);
1965  goto reexecute_byte;
1966  }
1967 
1968  break;
1969  }
1970 
1971  /* read until EOF */
1972  case s_body_identity_eof:
1973  MARK(body);
1974  p = data + len - 1;
1975 
1976  break;
1977 
1978  case s_message_done:
1979  state = NEW_MESSAGE();
1980  parser->nread = 0;
1981  data_or_header_data_start = p;
1982  CALLBACK_NOTIFY(message_complete);
1983  if (parser->upgrade) {
1984  /* Exit, the rest of the message is in a different protocol. */
1985  RETURN((p - data) + 1);
1986  }
1987  break;
1988 
1989  case s_chunk_size_start:
1990  {
1991  assert(parser->flags & F_CHUNKED);
1992 
1993  unhex_val = unhex[(unsigned char)ch];
1994  if (unhex_val == -1) {
1996  goto error;
1997  }
1998 
1999  parser->content_length = unhex_val;
2000  state = s_chunk_size;
2001  break;
2002  }
2003 
2004  case s_chunk_size:
2005  {
2006  assert(parser->flags & F_CHUNKED);
2007 
2008  if (ch == CR) {
2009  state = s_chunk_size_almost_done;
2010  break;
2011  }
2012 
2013  unhex_val = unhex[(unsigned char)ch];
2014 
2015  if (unhex_val == -1) {
2016  if (ch == ';' || ch == ' ') {
2017  state = s_chunk_parameters;
2018  break;
2019  }
2020 
2022  goto error;
2023  }
2024 
2025  if (parser->content_length > (INT64_MAX - unhex_val) >> 4) {
2026  /* overflow */
2028  goto error;
2029  }
2030  parser->content_length *= 16;
2031  parser->content_length += unhex_val;
2032  break;
2033  }
2034 
2035  case s_chunk_parameters:
2036  {
2037  assert(parser->flags & F_CHUNKED);
2038  /*
2039  * just ignore this shit. TODO check for overflow
2040  * TODO: It would be nice to pass this information to the
2041  * on_chunk_header callback.
2042  */
2043  if (ch == CR) {
2044  state = s_chunk_size_almost_done;
2045  break;
2046  }
2047  break;
2048  }
2049 
2051  {
2052  assert(parser->flags & F_CHUNKED);
2053  STRICT_CHECK(ch != LF);
2054 
2055  if (parser->content_length == 0) {
2056  parser->flags |= F_TRAILING;
2057  state = s_header_field_start;
2058  CALLBACK_NOTIFY(chunk_header);
2059  } else {
2060  state = s_chunk_data;
2061  CALLBACK_NOTIFY(chunk_header);
2062  }
2063  break;
2064  }
2065 
2066  case s_chunk_data:
2067  {
2068  uint64_t to_read = MIN(parser->content_length, (data + len) - p);
2069 
2070  assert(parser->flags & F_CHUNKED);
2071  assert(parser->content_length > 0);
2072 
2073  /* See the explanation in s_body_identity for why the content
2074  * length and data pointers are managed this way.
2075  */
2076  MARK(body);
2077  parser->content_length -= to_read;
2078  p += to_read - 1;
2079 
2080  if (parser->content_length == 0) {
2081  state = s_chunk_data_almost_done;
2082  }
2083 
2084  break;
2085  }
2086 
2088  assert(parser->flags & F_CHUNKED);
2089  assert(parser->content_length == 0);
2090  STRICT_CHECK(ch != CR);
2091  state = s_chunk_data_done;
2092  CALLBACK_DATA(body);
2093  break;
2094 
2095  case s_chunk_data_done:
2096  assert(parser->flags & F_CHUNKED);
2097  STRICT_CHECK(ch != LF);
2098  state = s_chunk_size_start;
2099  parser->nread = 0;
2100  data_or_header_data_start = p;
2101  CALLBACK_NOTIFY(chunk_complete);
2102  break;
2103 
2104  default:
2105  assert(0 && "unhandled state");
2107  goto error;
2108  }
2109  }
2110 
2111  /* We can check for overflow here because in Proxygen, len <= ~8KB and so the
2112  * worst thing that can happen is that we catch the overflow at 88KB rather
2113  * than at 80KB.
2114  * In case of chunk encoding, we count the overflow for every
2115  * chunk separately.
2116  * We zero the nread counter (and reset data_or_header_data_start) when we
2117  * start parsing a new message or a new chunk.
2118  */
2119  if (PARSING_HEADER(state)) {
2120  parser->nread += p - data_or_header_data_start;
2121  if (parser->nread > HTTP_MAX_HEADER_SIZE) {
2123  goto error;
2124  }
2125  }
2126 
2127  /* Run callbacks for any marks that we have leftover after we ran out of
2128  * bytes. There should be at most one of these set, so it's OK to invoke
2129  * them in series (unset marks will not result in callbacks).
2130  *
2131  * We use the NOADVANCE() variety of callbacks here because 'p' has already
2132  * overflowed 'data' and this allows us to correct for the off-by-one that
2133  * we'd otherwise have (since CALLBACK_DATA() is meant to be run with a 'p'
2134  * value that's in-bounds).
2135  */
2136 
2137  assert(((header_field_mark ? 1 : 0) +
2138  (header_value_mark ? 1 : 0) +
2139  (url_mark ? 1 : 0) +
2140  (reason_mark ? 1 : 0) +
2141  (body_mark ? 1 : 0)) <= 1);
2142 
2143  CALLBACK_DATA_NOADVANCE(header_field);
2144  CALLBACK_DATA_NOADVANCE(header_value);
2146  CALLBACK_DATA_NOADVANCE(reason);
2148 
2149  RETURN(len);
2150 
2151 error:
2152  if (HTTP_PARSER_ERRNO(parser) == HPE_OK) {
2154  }
2155 
2156  RETURN(p - data);
2157 }
2158 
2160 const char * http_method_str (enum http_method m)
2161 {
2162  return method_strings[m];
2163 }
2164 
2165 
/* Reset a parser object so it can start on a fresh message of type `t`.
 *
 * The cross-reference index at the end of this listing names this definition
 *   void http_parser_init(http_parser *parser, enum http_parser_type t)
 * The declarator line (listing line 2167) and listing line 2170 are absent
 * from this listing, so only the assignments visible below are described.
 */
2166 void
2168 {
      /* remember which kind of message this parser was set up for */
2169  parser->type = t;
      /* NOTE(review): listing line 2170 is missing here - presumably the
       * initial parser state is assigned on it; confirm against the real file. */
      /* clear the byte counter, upgrade marker, flag bits, method and version */
2171  parser->nread = 0;
2172  parser->upgrade = 0;
2173  parser->flags = 0;
2174  parser->method = 0;
2175  parser->http_major = 0;
2176  parser->http_minor = 0;
      /* start in the "no error" condition */
2177  parser->http_errno = HPE_OK;
2178 }
2179 
2180 const char *
2181 http_errno_name(enum http_errno err) {
2182  assert(err < (sizeof(http_strerror_tab)/sizeof(http_strerror_tab[0])));
2183  return http_strerror_tab[err].name;
2184 }
2185 
2186 const char *
2188  assert(err < (sizeof(http_strerror_tab)/sizeof(http_strerror_tab[0])));
2189  return http_strerror_tab[err].description;
2190 }
2191 
2192 
/* Single-step transition function for the host (authority) state machine.
 *
 * s  - state before `ch` is consumed
 * ch - next character of the authority section
 *
 * Returns the state after consuming `ch`, or s_http_host_dead when `ch` is
 * not accepted in state `s`. Every `break` below leads to that final return.
 */
2193 static enum http_host_state
2194 http_parse_host_char(enum http_host_state s, const char ch) {
2195  switch(s) {
      /* inside (or at the start of) userinfo: '@' ends it, host comes next */
2196  case s_http_userinfo:
2197  case s_http_userinfo_start:
2198  if (ch == '@') {
2199  return s_http_host_start;
2200  }
2201 
2202  if (IS_USERINFO_CHAR(ch)) {
2203  return s_http_userinfo;
2204  }
2205  break;
2206 
      /* first host character: '[' opens the bracketed (v6) form */
2207  case s_http_host_start:
2208  if (ch == '[') {
2209  return s_http_host_v6_start;
2210  }
2211 
2212  if (IS_HOST_CHAR(ch)) {
2213  return s_http_host;
2214  }
2215 
2216  break;
2217 
      /* plain host: keep consuming host characters ... */
2218  case s_http_host:
2219  if (IS_HOST_CHAR(ch)) {
2220  return s_http_host;
2221  }
2222 
      /* ... otherwise share the ':' check with the state after ']' */
2223  /* FALLTHROUGH */
2224  case s_http_host_v6_end:
2225  if (ch == ':') {
2226  return s_http_host_port_start;
2227  }
2228 
2229  break;
2230 
      /* bracketed host body: ']' closes it ... */
2231  case s_http_host_v6:
2232  if (ch == ']') {
2233  return s_http_host_v6_end;
2234  }
2235 
      /* ... otherwise accept the same characters as right after '[' */
2236  /* FALLTHROUGH */
2237  case s_http_host_v6_start:
2238  if (IS_HEX(ch) || ch == ':' || ch == '.') {
2239  return s_http_host_v6;
2240  }
2241 
2242  break;
2243 
      /* port: digits only.
       * NOTE(review): listing line 2245 is missing right after this label.
       * s_http_host_port_start is returned above but has no visible case, so
       * the missing line is presumably its label - confirm against the file. */
2244  case s_http_host_port:
2246  if (IS_NUM(ch)) {
2247  return s_http_host_port;
2248  }
2249 
2250  break;
2251 
2252  default:
2253  break;
2254  }
2255  return s_http_host_dead;
2256 }
2257 
/* Re-scan the span recorded in u->field_data[UF_HOST] and split it into
 * userinfo, host and port entries of `u`.
 *
 * buf      - buffer that the offsets in `u` refer to
 * u        - in: field_data[UF_HOST].off/.len describe the span to scan
 *            out: UF_HOST is narrowed to the host itself; UF_PORT and
 *            UF_USERINFO are filled in and flagged in field_set when seen
 * found_at - non-zero starts the scan in s_http_userinfo_start, zero starts
 *            it in s_http_host_start
 *
 * Returns 0 on success and 1 when a character is rejected or the span ends
 * in one of the states listed in the final switch.
 */
2258 static int
2259 http_parse_host(const char * buf, struct http_parser_url *u, int found_at) {
2260  enum http_host_state s;
2261 
2262  const char *p;
      /* despite the name this is the END offset of the span (off + len);
       * it is captured before .len is reset below */
2263  size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
2264 
      /* the host length is recounted character by character in the loop */
2265  u->field_data[UF_HOST].len = 0;
2266 
2267  s = found_at ? s_http_userinfo_start : s_http_host_start;
2268 
2269  for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
2270  enum http_host_state new_s = http_parse_host_char(s, *p);
2271 
2272  if (new_s == s_http_host_dead) {
2273  return 1;
2274  }
2275 
      /* on entering a state, record where its field starts; while staying
       * in it, just extend the field by one character */
2276  switch(new_s) {
2277  case s_http_host:
2278  if (s != s_http_host) {
2279  u->field_data[UF_HOST].off = p - buf;
2280  }
2281  u->field_data[UF_HOST].len++;
2282  break;
2283 
2284  case s_http_host_v6:
2285  if (s != s_http_host_v6) {
2286  u->field_data[UF_HOST].off = p - buf;
2287  }
2288  u->field_data[UF_HOST].len++;
2289  break;
2290 
2291  case s_http_host_port:
2292  if (s != s_http_host_port) {
2293  u->field_data[UF_PORT].off = p - buf;
2294  u->field_data[UF_PORT].len = 0;
2295  u->field_set |= (1 << UF_PORT);
2296  }
2297  u->field_data[UF_PORT].len++;
2298  break;
2299 
2300  case s_http_userinfo:
2301  if (s != s_http_userinfo) {
2302  u->field_data[UF_USERINFO].off = p - buf ;
2303  u->field_data[UF_USERINFO].len = 0;
2304  u->field_set |= (1 << UF_USERINFO);
2305  }
2306  u->field_data[UF_USERINFO].len++;
2307  break;
2308 
2309  default:
2310  break;
2311  }
2312  s = new_s;
2313  }
2314 
2315  /* Make sure we don't end somewhere unexpected */
      /* NOTE(review): listing line 2320 is missing from this label list -
       * presumably one more state that must not be final; confirm. */
2316  switch (s) {
2317  case s_http_host_start:
2318  case s_http_host_v6_start:
2319  case s_http_host_v6:
2321  case s_http_userinfo:
2322  case s_http_userinfo_start:
2323  return 1;
2324  default:
2325  break;
2326  }
2327 
2328  return 0;
2329 }
2330 
2331 
/* Split a URL into its components, recording each one as an (offset, length)
 * pair in `u`.
 *
 * buf        - URL text; scanned for exactly `buflen` bytes
 * buflen     - number of bytes of `buf` to scan
 * is_connect - non-zero starts the scan in s_req_server_start and requires
 *              the result to consist of exactly host and port
 * u          - output; only `port` and `field_set` are reset here, entries of
 *              field_data are written only for fields that are encountered
 *
 * Returns 0 on success and 1 on any parse failure.
 */
2332 int
2333 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
2334  struct http_parser_url *u)
2335 {
2336  enum state s;
2337  const char *p;
2338  enum http_parser_url_fields uf, old_uf;
2339  int found_at = 0;
2340 
2341  u->port = u->field_set = 0;
2342  s = is_connect ? s_req_server_start : s_req_spaces_before_url;
      /* UF_MAX serves as "no field yet" so the first real field starts a new span */
2343  uf = old_uf = UF_MAX;
2344 
2345  for (p = buf; p < buf + buflen; p++) {
2346  s = parse_url_char(s, *p);
2347 
2348  /* Figure out the next field that we're operating on */
2349  switch (s) {
2350  case s_dead:
2351  return 1;
2352 
      /* NOTE(review): listing lines 2355 and 2357 are missing from the label
       * list below - presumably two more delimiter states; confirm. */
2353  /* Skip delimiters */
2354  case s_req_schema_slash:
2356  case s_req_server_start:
2358  case s_req_fragment_start:
2359  continue;
2360 
2361  case s_req_schema:
2362  uf = UF_SCHEMA;
2363  break;
2364 
      /* an '@' was seen inside the server part: remember it for the host scan */
2365  case s_req_server_with_at:
2366  found_at = 1;
2367 
2368  /* FALLTHROUGH */
2369  case s_req_server:
2370  uf = UF_HOST;
2371  break;
2372 
2373  case s_req_path:
2374  uf = UF_PATH;
2375  break;
2376 
2377  case s_req_query_string:
2378  uf = UF_QUERY;
2379  break;
2380 
2381  case s_req_fragment:
2382  uf = UF_FRAGMENT;
2383  break;
2384 
2385  default:
2386  assert(!"Unexpected state");
2387  return 1;
2388  }
2389 
2390  /* Nothing's changed; soldier on */
2391  if (uf == old_uf) {
2392  u->field_data[uf].len++;
2393  continue;
2394  }
2395 
      /* a different field begins at this character */
2396  u->field_data[uf].off = p - buf;
2397  u->field_data[uf].len = 1;
2398 
2399  u->field_set |= (1 << uf);
2400  old_uf = uf;
2401  }
2402 
2403  /* host must be present if there is a schema */
2404  /* parsing http:///toto will fail */
      /* NOTE(review): when only UF_SCHEMA is set, http_parse_host() reads
       * field_data[UF_HOST], which this call never wrote. The rejection
       * presumably relies on the caller having zeroed *u - TODO confirm. */
2405  if ((u->field_set & ((1 << UF_SCHEMA) | (1 << UF_HOST))) != 0) {
2406  if (http_parse_host(buf, u, found_at) != 0) {
2407  return 1;
2408  }
2409  }
2410 
2411  /* CONNECT requests can only contain "hostname:port" */
2412  if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
2413  return 1;
2414  }
2415 
2416  if (u->field_set & (1 << UF_PORT)) {
2417  /* Don't bother with endp; we've already validated the string */
      /* NOTE(review): strtoul() stops at the first non-digit, not at buflen.
       * If the port runs to the end of a buffer that is not NUL-terminated,
       * the conversion may read beyond buflen - verify callers, or bound the
       * conversion by field_data[UF_PORT].len. */
2418  unsigned long v = strtoul(buf + u->field_data[UF_PORT].off, nullptr, 10);
2419 
2420  /* Ports have a max value of 2^16 - 1 (0xffff) */
2421  if (v > 0xffff) {
2422  return 1;
2423  }
2424 
2425  u->port = (uint16_t) v;
2426  }
2427 
2428  return 0;
2429 }
2430 
2431 void
2432 http_parser_pause(http_parser *parser, int paused) {
2433  /* Users should only be pausing/unpausing a parser that is not in an error
2434  * state. In non-debug builds, there's not much that we can do about this
2435  * other than ignore it.
2436  */
2437  if (HTTP_PARSER_ERRNO(parser) == HPE_OK ||
2438  HTTP_PARSER_ERRNO(parser) == HPE_PAUSED) {
2439  SET_ERRNO((paused) ? HPE_PAUSED : HPE_OK);
2440  } else {
2441  assert(0 && "Attempting to pause parser in error state");
2442  }
2443 }
2444 
2445 #if __cplusplus
2446 }
2447 #endif /* __cplusplus */
static const int8_t unhex[256]
Definition: http_parser.c:219
static struct @0 http_strerror_tab[]
#define HTTP_MAX_HEADER_SIZE
Definition: http_parser.h:64
uint32_t nread
Definition: http_parser.h:220
#define T(v)
Definition: http_parser.c:233
#define IS_HEADER_CHAR(ch)
Definition: http_parser.c:426
static int http_parse_host(const char *buf, struct http_parser_url *u, int found_at)
Definition: http_parser.c:2258
#define HTTP_ERRNO_MAP(XX)
Definition: http_parser.h:139
unsigned char flags
Definition: http_parser.h:215
#define CR
Definition: http_parser.c:395
#define RETURN(r)
Definition: http_parser.c:63
#define CALLBACK_NOTIFY_NOADVANCE(FOR)
Definition: http_parser.c:89
#define IS_ALPHA(c)
Definition: http_parser.c:401
const char * http_errno_name(enum http_errno err)
Definition: http_parser.c:2180
#define CALLBACK_DATA_NOADVANCE(FOR)
Definition: http_parser.c:115
#define MOVE_FAST
#define CALLBACK_SPACE(FOR)
Definition: http_parser.c:119
unsigned short http_minor
Definition: http_parser.h:225
#define CONTENT_LENGTH
Definition: http_parser.c:142
#define IS_ALPHANUM(c)
Definition: http_parser.c:403
#define PARSING_HEADER(state)
Definition: http_parser.c:357
uint16_t field_set
Definition: http_parser.h:283
size_t http_parser_execute(http_parser *parser, const http_parser_settings *settings, const char *data, size_t len)
Definition: http_parser.c:602
#define CALLBACK_DATA(FOR)
Definition: http_parser.c:111
#define TOKEN(c)
Definition: http_parser.c:400
#define MIN(a, b)
Definition: http_parser.c:46
static http_parser_settings settings
Definition: test.c:1529
#define IS_NUM(c)
Definition: http_parser.c:402
#define BS
Definition: http_parser.c:398
static http_parser * parser
Definition: test.c:40
requires And< SemiMovable< VN >... > &&SemiMovable< E > auto error(E e)
Definition: error.h:48
static const uint8_t normal_url_char[256]
Definition: http_parser.c:236
#define LOWER(c)
Definition: http_parser.c:399
auto ch
unsigned short status_code
Definition: http_parser.h:226
#define TRANSFER_ENCODING
Definition: http_parser.c:143
#define HTTP_STRERROR_GEN(n, s)
Definition: http_parser.c:435
const char * http_method_str(enum http_method m)
Definition: http_parser.c:2159
const char * name
Definition: http_parser.c:437
void http_parser_pause(http_parser *parser, int paused)
Definition: http_parser.c:2431
#define MARK(FOR)
Definition: http_parser.c:134
#define HTTP_PARSER_ERRNO(p)
Definition: http_parser.h:202
static const char * method_strings[]
Definition: http_parser.c:149
int http_parser_parse_url(const char *buf, size_t buflen, int is_connect, struct http_parser_url *u)
Definition: http_parser.c:2332
#define _CALLBACK_DATA(FOR, LEN, ER)
Definition: http_parser.c:92
unsigned char state
Definition: http_parser.h:216
#define UPGRADE
Definition: http_parser.c:144
unsigned char method
Definition: http_parser.h:227
const char * http_errno_description(enum http_errno err)
Definition: http_parser.c:2186
unsigned short http_major
Definition: http_parser.h:224
static map< string, int > m
http_data_cb on_headers_complete
Definition: http_parser.h:251
#define MOVE_THE_HEAD
#define NEW_MESSAGE()
Definition: http_parser.c:432
#define CALLBACK_NOTIFY(FOR)
Definition: http_parser.c:86
http_method
Definition: http_parser.h:90
void http_parser_init(http_parser *parser, enum http_parser_type t)
Definition: http_parser.c:2166
#define CHUNKED
Definition: http_parser.c:145
http_parser_type
Definition: http_parser.h:123
#define SET_ERRNO(e)
Definition: http_parser.c:57
unsigned char header_state
Definition: http_parser.h:217
unsigned char http_errno
Definition: http_parser.h:228
#define QT
Definition: http_parser.c:397
#define LF
Definition: http_parser.c:396
http_errno
Definition: http_parser.h:196
static set< string > s
static enum state parse_url_char(enum state s, const char ch)
Definition: http_parser.c:455
#define IS_URL_CHAR(c)
Definition: http_parser.c:416
#define IS_HEX(c)
Definition: http_parser.c:404
unsigned char type
Definition: http_parser.h:214
http_host_state
Definition: http_parser.c:379
#define IS_USERINFO_CHAR(c)
Definition: http_parser.c:408
#define STRICT_CHECK(cond)
Definition: http_parser.c:431
header_states
Definition: http_parser.c:360
char c
static constexpr uint64_t data[1]
Definition: Fingerprint.cpp:43
struct http_parser_url::@1 field_data[UF_MAX]
http_parser_url_fields
Definition: http_parser.h:263
#define IS_HOST_CHAR(c)
Definition: http_parser.c:418
int64_t content_length
Definition: http_parser.h:221
static const char tokens[256]
Definition: http_parser.c:184
state
Definition: http_parser.c:272
static enum http_host_state http_parse_host_char(enum http_host_state s, const char ch)
Definition: http_parser.c:2193
unsigned char index
Definition: http_parser.h:218
const char * description
Definition: http_parser.c:438