Fix http_parser_parse_url for urls like "http://host/path".

Before this change it would include the last slash in the separator between the
schema and host as part of the host. we cant use the trick used for skipping the
separator before ports, query strings, and fragments because if it was a CONNECT
style url string (host:port) it would skip the first character of the hostname.

Work around this by introducing a few more states to represent these separators
in a url differently to what theyre separating. this in turn lets us simplify
the url parsing so can simply skip what it considers delimiters rather than
having to special case certain types of url parts and skip their prefixes.

Add tests for the http_parser_parse_url().

This compares the http_parser_url struct that http_parser_parse_url()
produces against one that we expect from the test. If they differ
then http_parser_parse_url() misbehaved.
v0.10
David Gwynne 13 years ago committed by Ben Noordhuis
parent c3153bd1a9
commit 0499525110

@ -265,7 +265,9 @@ enum state
, s_req_schema , s_req_schema
, s_req_schema_slash , s_req_schema_slash
, s_req_schema_slash_slash , s_req_schema_slash_slash
, s_req_schema_slash_slash_end
, s_req_host , s_req_host
, s_req_port_start
, s_req_port , s_req_port
, s_req_path , s_req_path
, s_req_query_string_start , s_req_query_string_start
@ -448,19 +450,20 @@ parse_url_char(enum state s, const char ch, int is_connect)
case s_req_schema_slash_slash: case s_req_schema_slash_slash:
if (ch == '/') { if (ch == '/') {
return s_req_host; return s_req_schema_slash_slash_end;
} }
break; break;
case s_req_schema_slash_slash_end:
case s_req_host: case s_req_host:
if (IS_HOST_CHAR(ch)) { if (IS_HOST_CHAR(ch)) {
return s; return s_req_host;
} }
switch (ch) { switch (ch) {
case ':': case ':':
return s_req_port; return s_req_port_start;
case '/': case '/':
return s_req_path; return s_req_path;
@ -471,9 +474,10 @@ parse_url_char(enum state s, const char ch, int is_connect)
break; break;
case s_req_port_start:
case s_req_port: case s_req_port:
if (IS_NUM(ch)) { if (IS_NUM(ch)) {
return s; return s_req_port;
} }
switch (ch) { switch (ch) {
@ -613,17 +617,22 @@ size_t http_parser_execute (http_parser *parser,
header_field_mark = data; header_field_mark = data;
if (parser->state == s_header_value) if (parser->state == s_header_value)
header_value_mark = data; header_value_mark = data;
if (parser->state == s_req_path || switch (parser->state) {
parser->state == s_req_schema || case s_req_path:
parser->state == s_req_schema_slash || case s_req_schema:
parser->state == s_req_schema_slash_slash || case s_req_schema_slash:
parser->state == s_req_port || case s_req_schema_slash_slash:
parser->state == s_req_query_string_start || case s_req_schema_slash_slash_end:
parser->state == s_req_query_string || case s_req_port_start:
parser->state == s_req_host || case s_req_port:
parser->state == s_req_fragment_start || case s_req_query_string_start:
parser->state == s_req_fragment) case s_req_query_string:
case s_req_host:
case s_req_fragment_start:
case s_req_fragment:
url_mark = data; url_mark = data;
break;
}
for (p=data; p != data + len; p++) { for (p=data; p != data + len; p++) {
ch = *p; ch = *p;
@ -1000,7 +1009,9 @@ size_t http_parser_execute (http_parser *parser,
break; break;
} }
case s_req_schema_slash_slash_end:
case s_req_host: case s_req_host:
case s_req_port_start:
case s_req_port: case s_req_port:
case s_req_path: case s_req_path:
case s_req_query_string_start: case s_req_query_string_start:
@ -1919,15 +1930,23 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
uf = old_uf = UF_MAX; uf = old_uf = UF_MAX;
for (p = buf; p < buf + buflen; p++) { for (p = buf; p < buf + buflen; p++) {
if ((s = parse_url_char(s, *p, is_connect)) == s_dead) { s = parse_url_char(s, *p, is_connect);
return 1;
}
/* Figure out the next field that we're operating on */ /* Figure out the next field that we're operating on */
switch (s) { switch (s) {
case s_req_schema: case s_dead:
return 1;
/* Skip delimeters */
case s_req_schema_slash: case s_req_schema_slash:
case s_req_schema_slash_slash: case s_req_schema_slash_slash:
case s_req_schema_slash_slash_end:
case s_req_port_start:
case s_req_query_string_start:
case s_req_fragment_start:
continue;
case s_req_schema:
uf = UF_SCHEMA; uf = UF_SCHEMA;
break; break;
@ -1943,12 +1962,10 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
uf = UF_PATH; uf = UF_PATH;
break; break;
case s_req_query_string_start:
case s_req_query_string: case s_req_query_string:
uf = UF_QUERY; uf = UF_QUERY;
break; break;
case s_req_fragment_start:
case s_req_fragment: case s_req_fragment:
uf = UF_FRAGMENT; uf = UF_FRAGMENT;
break; break;
@ -1964,23 +1981,8 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
continue; continue;
} }
/* We ignore the first character in some fields; without this, we end up
* with the query being "?foo=bar" rather than "foo=bar". Callers probably
* don't want this.
*/
switch (uf) {
case UF_QUERY:
case UF_FRAGMENT:
case UF_PORT:
u->field_data[uf].off = p - buf + 1;
u->field_data[uf].len = 0;
break;
default:
u->field_data[uf].off = p - buf; u->field_data[uf].off = p - buf;
u->field_data[uf].len = 1; u->field_data[uf].len = 1;
break;
}
u->field_set |= (1 << uf); u->field_set |= (1 << uf);
old_uf = uf; old_uf = uf;

117
test.c

@ -1904,6 +1904,122 @@ test_preserve_data (void)
} }
} }
struct url_test {
const char *name;
const char *url;
int is_connect;
struct http_parser_url u;
int rv;
};
const struct url_test url_tests[] =
{ {.name="proxy request"
,.url="http://hostname/"
,.is_connect=0
,.u=
{.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PATH)
,.port=0
,.field_data=
{{ 0, 4 } /* UF_SCHEMA */
,{ 7, 8 } /* UF_HOST */
,{ 0, 0 } /* UF_PORT */
,{ 15, 1 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
}
}
,.rv=0
}
, {.name="CONNECT request"
,.url="hostname:443"
,.is_connect=1
,.u=
{.field_set=(1 << UF_HOST) | (1 << UF_PORT)
,.port=443
,.field_data=
{{ 0, 0 } /* UF_SCHEMA */
,{ 0, 8 } /* UF_HOST */
,{ 9, 3 } /* UF_PORT */
,{ 0, 0 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
}
}
,.rv=0
}
};
void
dump_url (const char *url, const struct http_parser_url *u)
{
char part[512];
unsigned int i;
printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port);
for (i = 0; i < UF_MAX; i++) {
if ((u->field_set & (1 << i)) == 0) {
printf("\tfield_data[%u]: unset\n", i);
continue;
}
memcpy(part, url + u->field_data[i].off, u->field_data[i].len);
part[u->field_data[i].len] = '\0';
printf("\tfield_data[%u]: off: %u len: %u part: \"%s\"\n",
i,
u->field_data[i].off,
u->field_data[i].len,
part);
}
}
void
test_parse_url (void)
{
struct http_parser_url u;
const struct url_test *test;
unsigned int i;
int rv;
for (i = 0; i < (sizeof(url_tests) / sizeof(url_tests[0])); i++) {
test = &url_tests[i];
memset(&u, 0, sizeof(u));
rv = http_parser_parse_url(test->url,
strlen(test->url),
test->is_connect,
&u);
if (test->rv == 0) {
if (rv != 0) {
printf("\n*** http_parser_parse_url() \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->name, rv);
exit(1);
}
if (memcmp(&u, &test->u, sizeof(u)) != 0) {
printf("\n*** http_parser_parse_url(\"%s\") \"%s\" failed ***\n",
test->url, test->name);
printf("target http_parser_url:\n");
dump_url(test->url, &test->u);
printf("result http_parser_url:\n");
dump_url(test->url, &u);
exit(1);
}
} else {
/* test->rv != 0 */
if (rv == 0) {
printf("\n*** http_parser_parse_url() \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->name, rv);
exit(1);
}
}
}
}
void void
test_message (const struct message *message) test_message (const struct message *message)
{ {
@ -2412,6 +2528,7 @@ main (void)
//// API //// API
test_preserve_data(); test_preserve_data();
test_parse_url();
//// OVERFLOW CONDITIONS //// OVERFLOW CONDITIONS

Loading…
Cancel
Save