diff --git a/.gitignore b/.gitignore index 8631bd8..3b868c5 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ tags test test_g test_fast +url_parser *.mk *.Makefile *.so diff --git a/Makefile b/Makefile index 8d90f8d..d9bf839 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,12 @@ test_fast: http_parser.o test.o http_parser.h test.o: test.c http_parser.h Makefile $(CC) $(CPPFLAGS_FAST) $(CFLAGS_FAST) -c test.c -o $@ +url_parser: http_parser_g.o url_parser.o + $(CC) $(CFLAGS_DEBUG) $(LDFLAGS) http_parser_g.o url_parser.o -o $@ + +url_parser.o: url_parser.c http_parser.h Makefile + $(CC) $(CPPFLAGS_DEBUG) $(CFLAGS_DEBUG) -c url_parser.c -o $@ + http_parser.o: http_parser.c http_parser.h Makefile $(CC) $(CPPFLAGS_FAST) $(CFLAGS_FAST) -c http_parser.c @@ -53,6 +59,6 @@ tags: http_parser.c http_parser.h test.c ctags $^ clean: - rm -f *.o *.a test test_fast test_g http_parser.tar tags libhttp_parser.so libhttp_parser.o + rm -f *.o *.a test test_fast test_g url_parser http_parser.tar tags libhttp_parser.so libhttp_parser.o .PHONY: clean package test-run test-run-timed test-valgrind diff --git a/http_parser.c b/http_parser.c index 0c11eb8..8122376 100644 --- a/http_parser.c +++ b/http_parser.c @@ -253,13 +253,9 @@ enum state , s_req_schema , s_req_schema_slash , s_req_schema_slash_slash - , s_req_host_start - , s_req_host_v6_start - , s_req_host_v6 - , s_req_host_v6_end - , s_req_host - , s_req_port_start - , s_req_port + , s_req_server_start + , s_req_server + , s_req_server_with_at , s_req_path , s_req_query_string_start , s_req_query_string @@ -337,6 +333,19 @@ enum header_states , h_connection_close }; +enum http_host_state + { + s_http_host_dead = 1 + , s_http_userinfo_start + , s_http_userinfo + , s_http_host_start + , s_http_host_v6_start + , s_http_host + , s_http_host_v6 + , s_http_host_v6_end + , s_http_host_port_start + , s_http_host_port +}; /* Macros for character classes; depends on strict-mode */ #define CR '\r' @@ -346,6 +355,12 @@ enum header_states #define IS_NUM(c) ((c) >= '0' && (c) <= '9') #define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c)) #define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f')) +#define IS_MARK(c) ((c) == '-' || (c) == '_' || (c) == '.' || \ + (c) == '!' || (c) == '~' || (c) == '*' || (c) == '\'' || (c) == '(' || \ + (c) == ')') +#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \ + (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \ + (c) == '$' || (c) == ',') #if HTTP_PARSER_STRICT #define TOKEN(c) (tokens[(unsigned char)c]) @@ -450,67 +465,33 @@ parse_url_char(enum state s, const char ch) case s_req_schema_slash_slash: if (ch == '/') { - return s_req_host_start; + return s_req_server_start; } break; - case s_req_host_start: - if (ch == '[') { - return s_req_host_v6_start; - } - - if (IS_HOST_CHAR(ch)) { - return s_req_host; - } - - break; - - case s_req_host: - if (IS_HOST_CHAR(ch)) { - return s_req_host; - } - - /* FALLTHROUGH */ - case s_req_host_v6_end: - switch (ch) { - case ':': - return s_req_port_start; - - case '/': - return s_req_path; - - case '?': - return s_req_query_string_start; + case s_req_server_with_at: + if (ch == '@') { + return s_dead; } - break; - - case s_req_host_v6: - if (ch == ']') { - return s_req_host_v6_end; + /* FALLTHROUGH */ + case s_req_server_start: + case s_req_server: + if (ch == '/') { + return s_req_path; } - /* FALLTHROUGH */ - case s_req_host_v6_start: - if (IS_HEX(ch) || ch == ':') { - return s_req_host_v6; + if (ch == '?') { + return s_req_query_string_start; } - break; - - case s_req_port: - switch (ch) { - case '/': - return s_req_path; - case '?': - return s_req_query_string_start; + if (ch == '@') { + return s_req_server_with_at; } - /* FALLTHROUGH */ - case s_req_port_start: - if (IS_NUM(ch)) { - return s_req_port; + if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') { + return s_req_server; } break; @@ -632,13 +613,9 @@ size_t http_parser_execute (http_parser *parser, case s_req_schema: case s_req_schema_slash: case s_req_schema_slash_slash: - case s_req_host_start: - case s_req_host_v6_start: - case s_req_host_v6: - case s_req_host_v6_end: - case s_req_host: - case s_req_port_start: - case s_req_port: + case s_req_server_start: + case s_req_server: + case s_req_server_with_at: case s_req_query_string_start: case s_req_query_string: case s_req_fragment_start: @@ -999,7 +976,7 @@ size_t http_parser_execute (http_parser *parser, MARK(url); if (parser->method == HTTP_CONNECT) { - parser->state = s_req_host_start; + parser->state = s_req_server_start; } parser->state = parse_url_char((enum state)parser->state, ch); @@ -1014,10 +991,7 @@ size_t http_parser_execute (http_parser *parser, case s_req_schema: case s_req_schema_slash: case s_req_schema_slash_slash: - case s_req_host_start: - case s_req_host_v6_start: - case s_req_host_v6: - case s_req_port_start: + case s_req_server_start: { switch (ch) { /* No whitespace allowed here */ @@ -1037,9 +1011,8 @@ size_t http_parser_execute (http_parser *parser, break; } - case s_req_host: - case s_req_host_v6_end: - case s_req_port: + case s_req_server: + case s_req_server_with_at: case s_req_path: case s_req_query_string_start: case s_req_query_string: @@ -1938,6 +1911,144 @@ http_errno_description(enum http_errno err) { return http_strerror_tab[err].description; } +static enum http_host_state +http_parse_host_char(enum http_host_state s, const char ch) { + switch(s) { + case s_http_userinfo: + case s_http_userinfo_start: + if (ch == '@') { + return s_http_host_start; + } + + if (IS_USERINFO_CHAR(ch)) { + return s_http_userinfo; + } + break; + + case s_http_host_start: + if (ch == '[') { + return s_http_host_v6_start; + } + + if (IS_HOST_CHAR(ch)) { + return s_http_host; + } + + break; + + case s_http_host: + if (IS_HOST_CHAR(ch)) { + return s_http_host; + } + + /* FALLTHROUGH */ + case s_http_host_v6_end: + if (ch == ':') { + return s_http_host_port_start; + } + + break; + + case s_http_host_v6: + if (ch == ']') { + return s_http_host_v6_end; + } + + /* FALLTHROUGH */ + case s_http_host_v6_start: + if (IS_HEX(ch) || ch == ':') { + return s_http_host_v6; + } + + break; + + case s_http_host_port: + case s_http_host_port_start: + if (IS_NUM(ch)) { + return s_http_host_port; + } + + break; + + default: + break; + } + return s_http_host_dead; +} + +static int +http_parse_host(const char * buf, struct http_parser_url *u, int found_at) { + enum http_host_state s; + + const char *p; + size_t buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len; + + u->field_data[UF_HOST].len = 0; + + s = found_at ? s_http_userinfo_start : s_http_host_start; + + for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) { + enum http_host_state new_s = http_parse_host_char(s, *p); + + if (new_s == s_http_host_dead) { + return 1; + } + + switch(new_s) { + case s_http_host: + if (s != s_http_host) { + u->field_data[UF_HOST].off = p - buf; + } + u->field_data[UF_HOST].len++; + break; + + case s_http_host_v6: + if (s != s_http_host_v6) { + u->field_data[UF_HOST].off = p - buf; + } + u->field_data[UF_HOST].len++; + break; + + case s_http_host_port: + if (s != s_http_host_port) { + u->field_data[UF_PORT].off = p - buf; + u->field_data[UF_PORT].len = 0; + u->field_set |= (1 << UF_PORT); + } + u->field_data[UF_PORT].len++; + break; + + case s_http_userinfo: + if (s != s_http_userinfo) { + u->field_data[UF_USERINFO].off = p - buf ; + u->field_data[UF_USERINFO].len = 0; + u->field_set |= (1 << UF_USERINFO); + } + u->field_data[UF_USERINFO].len++; + break; + + default: + break; + } + s = new_s; + } + + /* Make sure we don't end somewhere unexpected */ + switch (s) { + case s_http_host_start: + case s_http_host_v6_start: + case s_http_host_v6: + case s_http_host_port_start: + case s_http_userinfo: + case s_http_userinfo_start: + return 1; + default: + break; + } + + return 0; +} + int http_parser_parse_url(const char *buf, size_t buflen, int is_connect, struct http_parser_url *u) @@ -1945,9 +2056,10 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, enum state s; const char *p; enum http_parser_url_fields uf, old_uf; + int found_at = 0; u->port = u->field_set = 0; - s = is_connect ? s_req_host_start : s_req_spaces_before_url; + s = is_connect ? s_req_server_start : s_req_spaces_before_url; uf = old_uf = UF_MAX; for (p = buf; p < buf + buflen; p++) { @@ -1961,10 +2073,7 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, /* Skip delimeters */ case s_req_schema_slash: case s_req_schema_slash_slash: - case s_req_host_start: - case s_req_host_v6_start: - case s_req_host_v6_end: - case s_req_port_start: + case s_req_server_start: case s_req_query_string_start: case s_req_fragment_start: continue; @@ -1973,13 +2082,12 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, uf = UF_SCHEMA; break; - case s_req_host: - case s_req_host_v6: - uf = UF_HOST; - break; + case s_req_server_with_at: + found_at = 1; - case s_req_port: - uf = UF_PORT; + /* FALLTROUGH */ + case s_req_server: + uf = UF_HOST; break; case s_req_path: @@ -2012,21 +2120,17 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect, old_uf = uf; } - /* CONNECT requests can only contain "hostname:port" */ - if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) { - return 1; + /* host must be present if there is a schema */ + /* parsing http:///toto will fail */ + if ((u->field_set & ((1 << UF_SCHEMA) | (1 << UF_HOST))) != 0) { + if (http_parse_host(buf, u, found_at) != 0) { + return 1; + } } - /* Make sure we don't end somewhere unexpected */ - switch (s) { - case s_req_host_v6_start: - case s_req_host_v6: - case s_req_host_v6_end: - case s_req_host: - case s_req_port_start: + /* CONNECT requests can only contain "hostname:port" */ + if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) { return 1; - default: - break; } if (u->field_set & (1 << UF_PORT)) { diff --git a/http_parser.h b/http_parser.h index 7a36259..575e1be 100644 --- a/http_parser.h +++ b/http_parser.h @@ -256,7 +256,8 @@ enum http_parser_url_fields , UF_PATH = 3 , UF_QUERY = 4 , UF_FRAGMENT = 5 - , UF_MAX = 6 + , UF_USERINFO = 6 + , UF_MAX = 7 }; diff --git a/test.c b/test.c index 6d8c004..1ca8f05 100644 --- a/test.c +++ b/test.c @@ -50,6 +50,8 @@ struct message { char query_string[MAX_ELEMENT_SIZE]; char body[MAX_ELEMENT_SIZE]; size_t body_size; + const char *host; + const char *userinfo; uint16_t port; int num_headers; enum { NONE=0, FIELD, VALUE } last_header_element; @@ -630,6 +632,7 @@ const struct message requests[] = ,.fragment= "" ,.request_path= "" ,.request_url= "http://hypnotoad.org?hail=all" + ,.host= "hypnotoad.org" ,.num_headers= 0 ,.headers= { } ,.body= "" @@ -649,6 +652,7 @@ const struct message requests[] = ,.fragment= "" ,.request_path= "" ,.request_url= "http://hypnotoad.org:1234?hail=all" + ,.host= "hypnotoad.org" ,.port= 1234 ,.num_headers= 0 ,.headers= { } @@ -669,6 +673,7 @@ const struct message requests[] = ,.fragment= "" ,.request_path= "" ,.request_url= "http://hypnotoad.org:1234" + ,.host= "hypnotoad.org" ,.port= 1234 ,.num_headers= 0 ,.headers= { } @@ -870,6 +875,28 @@ const struct message requests[] = ,.body= "" } +#define PROXY_WITH_BASIC_AUTH 33 +, {.name= "host:port and basic_auth" + ,.type= HTTP_REQUEST + ,.raw= "GET http://a%12:b!&*$@hypnotoad.org:1234/toto HTTP/1.1\r\n" + "\r\n" + ,.should_keep_alive= TRUE + ,.message_complete_on_eof= FALSE + ,.http_major= 1 + ,.http_minor= 1 + ,.method= HTTP_GET + ,.fragment= "" + ,.request_path= "/toto" + ,.request_url= "http://a%12:b!&*$@hypnotoad.org:1234/toto" + ,.host= "hypnotoad.org" + ,.userinfo= "a%12:b!&*$" + ,.port= 1234 + ,.num_headers= 0 + ,.headers= { } + ,.body= "" + } + + , {.name= NULL } /* sentinel */ }; @@ -1794,6 +1821,14 @@ message_eq (int index, const struct message *expected) abort(); } + if (expected->host) { + MESSAGE_CHECK_URL_EQ(&u, expected, m, host, UF_HOST); + } + + if (expected->userinfo) { + MESSAGE_CHECK_URL_EQ(&u, expected, m, userinfo, UF_USERINFO); + } + m->port = (u.field_set & (1 << UF_PORT)) ? u.port : 0; @@ -1966,6 +2001,26 @@ const struct url_test url_tests[] = ,{ 15, 1 } /* UF_PATH */ ,{ 0, 0 } /* UF_QUERY */ ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ + } + } + ,.rv=0 + } + +, {.name="proxy request with port" + ,.url="http://hostname:444/" + ,.is_connect=0 + ,.u= + {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PORT) | (1 << UF_PATH) + ,.port=444 + ,.field_data= + {{ 0, 4 } /* UF_SCHEMA */ + ,{ 7, 8 } /* UF_HOST */ + ,{ 16, 3 } /* UF_PORT */ + ,{ 19, 1 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ } } ,.rv=0 @@ -1984,11 +2039,18 @@ const struct url_test url_tests[] = ,{ 0, 0 } /* UF_PATH */ ,{ 0, 0 } /* UF_QUERY */ ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ } } ,.rv=0 } +, {.name="CONNECT request but not connect" + ,.url="hostname:443" + ,.is_connect=0 + ,.rv=1 + } + , {.name="proxy ipv6 request" ,.url="http://[1:2::3:4]/" ,.is_connect=0 @@ -2002,6 +2064,26 @@ const struct url_test url_tests[] = ,{ 17, 1 } /* UF_PATH */ ,{ 0, 0 } /* UF_QUERY */ ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ + } + } + ,.rv=0 + } + +, {.name="proxy ipv6 request with port" + ,.url="http://[1:2::3:4]:67/" + ,.is_connect=0 + ,.u= + {.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PORT) | (1 << UF_PATH) + ,.port=67 + ,.field_data= + {{ 0, 4 } /* UF_SCHEMA */ + ,{ 8, 8 } /* UF_HOST */ + ,{ 18, 2 } /* UF_PORT */ + ,{ 20, 1 } /* UF_PATH */ + ,{ 0, 0 } /* UF_QUERY */ + ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ } } ,.rv=0 @@ -2020,13 +2102,16 @@ const struct url_test url_tests[] = ,{ 0, 0 } /* UF_PATH */ ,{ 0, 0 } /* UF_QUERY */ ,{ 0, 0 } /* UF_FRAGMENT */ + ,{ 0, 0 } /* UF_USERINFO */ } } ,.rv=0 } , {.name="extra ? in query string" - ,.url="http://a.tbcdn.cn/p/fp/2010c/??fp-header-min.css,fp-base-min.css,fp-channel-min.css,fp-product-min.css,fp-mall-min.css,fp-category-min.css,fp-sub-min.css,fp-gdp4p-min.css,fp-css3-min.css,fp-misc-min.css?t=20101022.css" + ,.url="http://a.tbcdn.cn/p/fp/2010c/??fp-header-min.css,fp-base-min.css," + "fp-channel-min.css,fp-product-min.css,fp-mall-min.css,fp-category-min.css," + "fp-sub-min.css,fp-gdp4p-min.css,fp-css3-min.css,fp-misc-min.css?t=20101022.css" ,.is_connect=0 ,.u= {.field_set=(1<field_set, u->port); @@ -2149,14 +2462,12 @@ dump_url (const char *url, const struct http_parser_url *u) continue; } - memcpy(part, url + u->field_data[i].off, u->field_data[i].len); - part[u->field_data[i].len] = '\0'; - - printf("\tfield_data[%u]: off: %u len: %u part: \"%s\"\n", + printf("\tfield_data[%u]: off: %u len: %u part: \"%.*s\n", i, u->field_data[i].off, u->field_data[i].len, - part); + u->field_data[i].len, + url + u->field_data[i].off); } } diff --git a/url_parser.c b/url_parser.c new file mode 100644 index 0000000..b1f9c97 --- /dev/null +++ b/url_parser.c @@ -0,0 +1,44 @@ +#include "http_parser.h" +#include +#include + +void +dump_url (const char *url, const struct http_parser_url *u) +{ + unsigned int i; + + printf("\tfield_set: 0x%x, port: %u\n", u->field_set, u->port); + for (i = 0; i < UF_MAX; i++) { + if ((u->field_set & (1 << i)) == 0) { + printf("\tfield_data[%u]: unset\n", i); + continue; + } + + printf("\tfield_data[%u]: off: %u len: %u part: \"%.*s\n", + i, + u->field_data[i].off, + u->field_data[i].len, + u->field_data[i].len, + url + u->field_data[i].off); + } +} + +int main(int argc, char ** argv) { + if (argc != 3) { + printf("Syntax : %s connect|get url\n", argv[0]); + return 1; + } + struct http_parser_url u; + int len = strlen(argv[2]); + int connect = strcmp("connect", argv[1]) == 0 ? 1 : 0; + printf("Parsing %s, connect %d\n", argv[2], connect); + + int result = http_parser_parse_url(argv[2], len, connect, &u); + if (result != 0) { + printf("Parse error : %d\n", result); + return result; + } + printf("Parse ok, result : \n"); + dump_url(argv[2], &u); + return 0; +} \ No newline at end of file