From 8da60bc423d0243588854e5c4b2f3ae0c637cb8d Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Fri, 10 Feb 2012 21:06:18 +1000 Subject: [PATCH] implement parsing of v6 addresses and rejection of 0-length host and ports. the v6 parsing works by adding extra states for working with the [] notation for v6 addresses. hosts and ports cannot be 0-length because we url parsing from ending when we expect those fields to begin. http_parser_parse_url gets a free check for the correctness of CONNECT urls (they can only be host:port). this addresses the following issues: i was bored and had my head in this space. --- http_parser.c | 114 ++++++++++++++++++++++++++++++++++++-------------- test.c | 74 ++++++++++++++++++++++++++++++-- 2 files changed, 152 insertions(+), 36 deletions(-) diff --git a/http_parser.c b/http_parser.c index 1affeba..5a11de0 100644 --- a/http_parser.c +++ b/http_parser.c @@ -265,7 +265,10 @@ enum state , s_req_schema , s_req_schema_slash , s_req_schema_slash_slash - , s_req_schema_slash_slash_end + , s_req_host_start + , s_req_host_v6_start + , s_req_host_v6 + , s_req_host_v6_end , s_req_host , s_req_port_start , s_req_port @@ -354,6 +357,7 @@ enum header_states #define IS_ALPHA(c) (LOWER(c) >= 'a' && LOWER(c) <= 'z') #define IS_NUM(c) ((c) >= '0' && (c) <= '9') #define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c)) +#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f')) #if HTTP_PARSER_STRICT #define TOKEN(c) (tokens[(unsigned char)c]) @@ -410,22 +414,22 @@ int http_message_needs_eof(http_parser *parser); * URL and non-URL states by looking for these. */ static enum state -parse_url_char(enum state s, const char ch, int is_connect) +parse_url_char(enum state s, const char ch) { assert(!isspace(ch)); switch (s) { case s_req_spaces_before_url: + /* Proxied requests are followed by scheme of an absolute URI (alpha). + * All methods except CONNECT are followed by '/' or '*'. + */ + if (ch == '/' || ch == '*') { return s_req_path; } - /* Proxied requests are followed by scheme of an absolute URI (alpha). - * CONNECT is followed by a hostname, which begins with alphanum. - * All other methods are followed by '/' or '*' (handled above). - */ - if (IS_ALPHA(ch) || (is_connect && IS_NUM(ch))) { - return (is_connect) ? s_req_host : s_req_schema; + if (IS_ALPHA(ch)) { + return s_req_schema; } break; @@ -450,17 +454,29 @@ parse_url_char(enum state s, const char ch, int is_connect) case s_req_schema_slash_slash: if (ch == '/') { - return s_req_schema_slash_slash_end; + return s_req_host_start; + } + + break; + + case s_req_host_start: + if (ch == '[') { + return s_req_host_v6_start; + } + + if (IS_HOST_CHAR(ch)) { + return s_req_host; } break; - case s_req_schema_slash_slash_end: case s_req_host: if (IS_HOST_CHAR(ch)) { return s_req_host; } + /* FALLTHROUGH */ + case s_req_host_v6_end: switch (ch) { case ':': return s_req_port_start; @@ -474,12 +490,19 @@ parse_url_char(enum state s, const char ch, int is_connect) break; - case s_req_port_start: - case s_req_port: - if (IS_NUM(ch)) { - return s_req_port; + case s_req_host_v6: + if (ch == ']') { + return s_req_host_v6_end; } + /* FALLTHROUGH */ + case s_req_host_v6_start: + if (IS_HEX(ch) || ch == ':') { + return s_req_host_v6; + } + break; + + case s_req_port: switch (ch) { case '/': return s_req_path; @@ -488,6 +511,12 @@ parse_url_char(enum state s, const char ch, int is_connect) return s_req_query_string_start; } + /* FALLTHROUGH */ + case s_req_port_start: + if (IS_NUM(ch)) { + return s_req_port; + } + break; case s_req_path: @@ -622,12 +651,15 @@ size_t http_parser_execute (http_parser *parser, case s_req_schema: case s_req_schema_slash: case s_req_schema_slash_slash: - case s_req_schema_slash_slash_end: + case s_req_host_start: + case s_req_host_v6_start: + case s_req_host_v6: + case s_req_host_v6_end: + case s_req_host: case s_req_port_start: case s_req_port: case s_req_query_string_start: case s_req_query_string: - case s_req_host: case s_req_fragment_start: case s_req_fragment: url_mark = data; @@ -975,9 +1007,11 @@ size_t http_parser_execute (http_parser *parser, if (ch == ' ') break; MARK(url); + if (parser->method == HTTP_CONNECT) { + parser->state = s_req_host_start; + } - parser->state = parse_url_char( - (enum state)parser->state, ch, parser->method == HTTP_CONNECT); + parser->state = parse_url_char((enum state)parser->state, ch); if (parser->state == s_dead) { SET_ERRNO(HPE_INVALID_URL); goto error; @@ -989,6 +1023,10 @@ size_t http_parser_execute (http_parser *parser, case s_req_schema: case s_req_schema_slash: case s_req_schema_slash_slash: + case s_req_host_start: + case s_req_host_v6_start: + case s_req_host_v6: + case s_req_port_start: { switch (ch) { /* No whitespace allowed here */ @@ -998,8 +1036,7 @@ size_t http_parser_execute (http_parser *parser, SET_ERRNO(HPE_INVALID_URL); goto error; default: - parser->state = parse_url_char( - (enum state)parser->state, ch, parser->method == HTTP_CONNECT); + parser->state = parse_url_char((enum state)parser->state, ch); if (parser->state == s_dead) { SET_ERRNO(HPE_INVALID_URL); goto error; @@ -1009,9 +1046,8 @@ size_t http_parser_execute (http_parser *parser, break; } - case s_req_schema_slash_slash_end: case s_req_host: - case s_req_port_start: + case s_req_host_v6_end: case s_req_port: case s_req_path: case s_req_query_string_start: @@ -1019,11 +1055,6 @@ size_t http_parser_execute (http_parser *parser, case s_req_fragment_start: case s_req_fragment: { - /* XXX: There is a bug here where if we're on the first character - * of s_req_host (e.g. our URL is 'http://' and we see a whitespace - * character, we'll consider this a valid URL. This seems incorrect, - * but at least it's bug-compatible with what we had before. - */ switch (ch) { case ' ': parser->state = s_req_http_start; @@ -1039,8 +1070,7 @@ size_t http_parser_execute (http_parser *parser, CALLBACK_DATA(url); break; default: - parser->state = parse_url_char( - (enum state)parser->state, ch, parser->method == HTTP_CONNECT); + parser->state = parse_url_char((enum state)parser->state, ch); if (parser->state == s_dead) { SET_ERRNO(HPE_INVALID_URL); goto error; @@ -1926,11 +1956,11 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, enum http_parser_url_fields uf, old_uf; u->port = u->field_set = 0; - s = s_req_spaces_before_url; + s = is_connect ? s_req_host_start : s_req_spaces_before_url; uf = old_uf = UF_MAX; for (p = buf; p < buf + buflen; p++) { - s = parse_url_char(s, *p, is_connect); + s = parse_url_char(s, *p); /* Figure out the next field that we're operating on */ switch (s) { @@ -1940,7 +1970,9 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, /* Skip delimeters */ case s_req_schema_slash: case s_req_schema_slash_slash: - case s_req_schema_slash_slash_end: + case s_req_host_start: + case s_req_host_v6_start: + case s_req_host_v6_end: case s_req_port_start: case s_req_query_string_start: case s_req_fragment_start: @@ -1951,6 +1983,7 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, break; case s_req_host: + case s_req_host_v6: uf = UF_HOST; break; @@ -1988,6 +2021,23 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, old_uf = uf; } + /* CONNECT requests can only contain "hostname:port" */ + if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) { + return 1; + } + + /* Make sure we don't end somewhere unexpected */ + switch (s) { + case s_req_host_v6_start: + case s_req_host_v6: + case s_req_host_v6_end: + case s_req_host: + case s_req_port_start: + return 1; + default: + break; + } + if (u->field_set & (1 << UF_PORT)) { /* Don't bother with endp; we've already validated the string */ unsigned long v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10); diff --git a/test.c b/test.c index 3360fe0..e3a6788 100644 --- a/test.c +++ b/test.c @@ -1948,6 +1948,72 @@ const struct url_test url_tests[] = } ,.rv=0 } + +, {.name="proxy ipv6 request" + ,.url="http://[1:2::3:4]/" + ,.is_connect=0 + ,.u= + {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PATH) + ,.port=0 + ,.field_data= + {{ 0, 4 } /* UF_SCHEMA */ + ,{ 8, 8 } /* UF_HOST */ + ,{ 0, 0 } /* UF_PORT */ + ,{ 17, 1 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + } + } + ,.rv=0 + } + +, {.name="CONNECT ipv6 address" + ,.url="[1:2::3:4]:443" + ,.is_connect=1 + ,.u= + {.field_set=(1 << UF_HOST) | (1 << UF_PORT) + ,.port=443 + ,.field_data= + {{ 0, 0 } /* UF_SCHEMA */ + ,{ 1, 8 } /* UF_HOST */ + ,{ 11, 3 } /* UF_PORT */ + ,{ 0, 0 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + } + } + ,.rv=0 + } + +, {.name="proxy empty host" + ,.url="http://:443/" + ,.is_connect=1 + ,.rv=1 + } + +, {.name="proxy empty port" + ,.url="http://hostname:/" + ,.is_connect=1 + ,.rv=1 + } + +, {.name="CONNECT empty host" + ,.url=":443" + ,.is_connect=1 + ,.rv=1 + } + +, {.name="CONNECT empty port" + ,.url="hostname:" + ,.is_connect=1 + ,.rv=1 + } + +, {.name="CONNECT with extra bits" + ,.url="hostname:443/" + ,.is_connect=1 + ,.rv=1 + } }; void @@ -1993,8 +2059,8 @@ test_parse_url (void) if (test->rv == 0) { if (rv != 0) { - printf("\n*** http_parser_parse_url() \"%s\" test failed, " - "unexpected rv %d ***\n\n", test->name, rv); + printf("\n*** http_parser_parse_url(\"%s\") \"%s\" test failed, " + "unexpected rv %d ***\n\n", test->url, test->name, rv); exit(1); } @@ -2012,8 +2078,8 @@ test_parse_url (void) } else { /* test->rv != 0 */ if (rv == 0) { - printf("\n*** http_parser_parse_url() \"%s\" test failed, " - "unexpected rv %d ***\n\n", test->name, rv); + printf("\n*** http_parser_parse_url(\"%s\") \"%s\" test failed, " + "unexpected rv %d ***\n\n", test->url, test->name, rv); exit(1); } }