From 04995251103354870eb4d6ee1a65e95827721e06 Mon Sep 17 00:00:00 2001 From: David Gwynne Date: Tue, 7 Feb 2012 07:46:43 +1000 Subject: [PATCH] Fix http_parser_parse_url for urls like "http://host/path". Before this change it would include the last slash in the separator between the schema and host as part of the host. we cant use the trick used for skipping the separator before ports, query strings, and fragments because if it was a CONNECT style url string (host:port) it would skip the first character of the hostname. Work around this by introducing a few more states to represent these separators in a url differently to what theyre separating. this in turn lets us simplify the url parsing so can simply skip what it considers delimiters rather than having to special case certain types of url parts and skip their prefixes. Add tests for the http_parser_parse_url(). This compares the http_parser_url struct that http_parser_parse_url() produces against one that we expect from the test. If they differ then http_parser_parse_url() misbehaved. --- http_parser.c | 76 ++++++++++++++++---------------- test.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 37 deletions(-) diff --git a/http_parser.c b/http_parser.c index 83918e0..1affeba 100644 --- a/http_parser.c +++ b/http_parser.c @@ -265,7 +265,9 @@ enum state , s_req_schema , s_req_schema_slash , s_req_schema_slash_slash + , s_req_schema_slash_slash_end , s_req_host + , s_req_port_start , s_req_port , s_req_path , s_req_query_string_start @@ -448,19 +450,20 @@ parse_url_char(enum state s, const char ch, int is_connect) case s_req_schema_slash_slash: if (ch == '/') { - return s_req_host; + return s_req_schema_slash_slash_end; } break; + case s_req_schema_slash_slash_end: case s_req_host: if (IS_HOST_CHAR(ch)) { - return s; + return s_req_host; } switch (ch) { case ':': - return s_req_port; + return s_req_port_start; case '/': return s_req_path; @@ -471,9 +474,10 @@ parse_url_char(enum state s, const char ch, int is_connect) break; + case s_req_port_start: case s_req_port: if (IS_NUM(ch)) { - return s; + return s_req_port; } switch (ch) { @@ -613,17 +617,22 @@ size_t http_parser_execute (http_parser *parser, header_field_mark = data; if (parser->state == s_header_value) header_value_mark = data; - if (parser->state == s_req_path || - parser->state == s_req_schema || - parser->state == s_req_schema_slash || - parser->state == s_req_schema_slash_slash || - parser->state == s_req_port || - parser->state == s_req_query_string_start || - parser->state == s_req_query_string || - parser->state == s_req_host || - parser->state == s_req_fragment_start || - parser->state == s_req_fragment) + switch (parser->state) { + case s_req_path: + case s_req_schema: + case s_req_schema_slash: + case s_req_schema_slash_slash: + case s_req_schema_slash_slash_end: + case s_req_port_start: + case s_req_port: + case s_req_query_string_start: + case s_req_query_string: + case s_req_host: + case s_req_fragment_start: + case s_req_fragment: url_mark = data; + break; + } for (p=data; p != data + len; p++) { ch = *p; @@ -1000,7 +1009,9 @@ size_t http_parser_execute (http_parser *parser, break; } + case s_req_schema_slash_slash_end: case s_req_host: + case s_req_port_start: case s_req_port: case s_req_path: case s_req_query_string_start: @@ -1919,15 +1930,23 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, uf = old_uf = UF_MAX; for (p = buf; p < buf + buflen; p++) { - if ((s = parse_url_char(s, *p, is_connect)) == s_dead) { - return 1; - } + s = parse_url_char(s, *p, is_connect); /* Figure out the next field that we're operating on */ switch (s) { - case s_req_schema: + case s_dead: + return 1; + + /* Skip delimeters */ case s_req_schema_slash: case s_req_schema_slash_slash: + case s_req_schema_slash_slash_end: + case s_req_port_start: + case s_req_query_string_start: + case s_req_fragment_start: + continue; + + case s_req_schema: uf = UF_SCHEMA; break; @@ -1943,12 +1962,10 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, uf = UF_PATH; break; - case s_req_query_string_start: case s_req_query_string: uf = UF_QUERY; break; - case s_req_fragment_start: case s_req_fragment: uf = UF_FRAGMENT; break; @@ -1964,23 +1981,8 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, continue; } - /* We ignore the first character in some fields; without this, we end up - * with the query being "?foo=bar" rather than "foo=bar". Callers probably - * don't want this. - */ - switch (uf) { - case UF_QUERY: - case UF_FRAGMENT: - case UF_PORT: - u->field_data[uf].off = p - buf + 1; - u->field_data[uf].len = 0; - break; - - default: - u->field_data[uf].off = p - buf; - u->field_data[uf].len = 1; - break; - } + u->field_data[uf].off = p - buf; + u->field_data[uf].len = 1; u->field_set |= (1 << uf); old_uf = uf; diff --git a/test.c b/test.c index a72ea09..3360fe0 100644 --- a/test.c +++ b/test.c @@ -1904,6 +1904,122 @@ test_preserve_data (void) } } +struct url_test { + const char *name; + const char *url; + int is_connect; + struct http_parser_url u; + int rv; +}; + +const struct url_test url_tests[] = +{ {.name="proxy request" + ,.url="http://hostname/" + ,.is_connect=0 + ,.u= + {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PATH) + ,.port=0 + ,.field_data= + {{ 0, 4 } /* UF_SCHEMA */ + ,{ 7, 8 } /* UF_HOST */ + ,{ 0, 0 } /* UF_PORT */ + ,{ 15, 1 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + } + } + ,.rv=0 + } + +, {.name="CONNECT request" + ,.url="hostname:443" + ,.is_connect=1 + ,.u= + {.field_set=(1 << UF_HOST) | (1 << UF_PORT) + ,.port=443 + ,.field_data= + {{ 0, 0 } /* UF_SCHEMA */ + ,{ 0, 8 } /* UF_HOST */ + ,{ 9, 3 } /* UF_PORT */ + ,{ 0, 0 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + } + } + ,.rv=0 + } +}; + +void +dump_url (const char *url, const struct http_parser_url *u) +{ + char part[512]; + unsigned int i; + + printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port); + for (i = 0; i < UF_MAX; i++) { + if ((u->field_set & (1 << i)) == 0) { + printf("\tfield_data[%u]: unset\n", i); + continue; + } + + memcpy(part, url + u->field_data[i].off, u->field_data[i].len); + part[u->field_data[i].len] = '\0'; + + printf("\tfield_data[%u]: off: %u len: %u part: \"%s\"\n", + i, + u->field_data[i].off, + u->field_data[i].len, + part); + } +} + +void +test_parse_url (void) +{ + struct http_parser_url u; + const struct url_test *test; + unsigned int i; + int rv; + + for (i = 0; i < (sizeof(url_tests) / sizeof(url_tests[0])); i++) { + test = &url_tests[i]; + memset(&u, 0, sizeof(u)); + + rv = http_parser_parse_url(test->url, + strlen(test->url), + test->is_connect, + &u); + + if (test->rv == 0) { + if (rv != 0) { + printf("\n*** http_parser_parse_url() \"%s\" test failed, " + "unexpected rv %d ***\n\n", test->name, rv); + exit(1); + } + + if (memcmp(&u, &test->u, sizeof(u)) != 0) { + printf("\n*** http_parser_parse_url(\"%s\") \"%s\" failed ***\n", + test->url, test->name); + + printf("target http_parser_url:\n"); + dump_url(test->url, &test->u); + printf("result http_parser_url:\n"); + dump_url(test->url, &u); + + exit(1); + } + } else { + /* test->rv != 0 */ + if (rv == 0) { + printf("\n*** http_parser_parse_url() \"%s\" test failed, " + "unexpected rv %d ***\n\n", test->name, rv); + exit(1); + } + } + } +} + void test_message (const struct message *message) { @@ -2412,6 +2528,7 @@ main (void) //// API test_preserve_data(); + test_parse_url(); //// OVERFLOW CONDITIONS