From 50b9bec552ef6ff6ecb8d0e26fcb4da30ac62934 Mon Sep 17 00:00:00 2001 From: Peter Griess Date: Wed, 11 May 2011 10:48:22 -0500 Subject: [PATCH] Allow octets > 127 in path components. - This is non-spec behavior, but it appears that most HTTP servers implicitly support non-ASCII characters when parsing path components. Extend http-parser to allow this. - Fill out slots [128, 256) in normal_url_char[] with 1 so that these high octets are accepted in path components. - Add unit test for paths that include such non-ASCII characters. Fixes #37. --- http_parser.c | 23 ++++++++++++++++++++++- test.c | 23 ++++++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/http_parser.c b/http_parser.c index 25d7a51..4a0129b 100644 --- a/http_parser.c +++ b/http_parser.c @@ -186,7 +186,28 @@ static const uint8_t normal_url_char[256] = { /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */ 1, 1, 1, 1, 1, 1, 1, 1, /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */ - 1, 1, 1, 1, 1, 1, 1, 0 }; + 1, 1, 1, 1, 1, 1, 1, 0, + +/* Remaining non-ASCII octets are accepted as-is to support implicitly UTF-8 + encoded paths. This is out of spec, but clients generate this and most other + HTTP servers support it. We should, too. 
*/ + + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 }; enum state diff --git a/test.c b/test.c index 76cc8b7..90234f0 100644 --- a/test.c +++ b/test.c @@ -557,7 +557,7 @@ const struct message requests[] = ,.body= "" } -#define MSEARCH_REQ 19 +#define MSEARCH_REQ 20 , {.name= "m-search request" ,.type= HTTP_REQUEST ,.raw= "M-SEARCH * HTTP/1.1\r\n" @@ -582,6 +582,27 @@ const struct message requests[] = ,.body= "" } +#define UTF8_PATH_REQ 21 +, {.name= "utf-8 path request" + ,.type= HTTP_REQUEST + ,.raw= "GET /δ¶/δt/pope?q=1#narf HTTP/1.1\r\n" + "Host: github.com\r\n" + "\r\n" + ,.should_keep_alive= TRUE + ,.message_complete_on_eof= FALSE + ,.http_major= 1 + ,.http_minor= 1 + ,.method= HTTP_GET + ,.query_string= "q=1" + ,.fragment= "narf" + ,.request_path= "/δ¶/δt/pope" + ,.request_url= "/δ¶/δt/pope?q=1#narf" + ,.num_headers= 1 + ,.headers= { {"Host", "github.com" } + } + ,.body= "" + } + , {.name= NULL } /* sentinel */ };