From 5aba2a585033945fe8c7d58c27eb7eb47a391eb9 Mon Sep 17 00:00:00 2001 From: hniksic Date: Fri, 6 May 2005 17:34:45 -0700 Subject: [PATCH] [svn] Don't needlessly decode % escapes in URLs. --- src/ChangeLog | 5 +++ src/url.c | 112 +++++++++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 65 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 17d53ff6..d5044c72 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +2005-05-07 Hrvoje Niksic + + * url.c (decide_copy_method): Never cause reencode_escapes to + decode % escapes; it is too intrusive and breaks some servers. + 2005-05-07 Hrvoje Niksic * http.c (gethttp): When tunnelling SSL traffic over proxy with diff --git a/src/url.c b/src/url.c index f77d8ad9..ad8fc202 100644 --- a/src/url.c +++ b/src/url.c @@ -255,34 +255,29 @@ url_escape_allow_passthrough (const char *s) return url_escape_1 (s, urlchr_unsafe, 1); } -enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH }; +enum copy_method { cm_encode, cm_passthrough }; + +/* Decide whether to encode or pass through the char at P. This used + to be a macro, but it got a little too convoluted. */ -/* Decide whether to encode, decode, or pass through the char at P. - This used to be a macro, but it got a little too convoluted. */ static inline enum copy_method decide_copy_method (const char *p) { if (*p == '%') { if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2))) - { - /* %xx sequence: decode it, unless it would decode to an - unsafe or a reserved char; in that case, leave it as - is. */ - char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2)); - if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt)) - return CM_PASSTHROUGH; - else - return CM_DECODE; - } + /* Prior to 1.10 this decoded %HH escapes corresponding to + "safe" chars, but that proved too obtrusive -- it's better + to always preserve the escapes found in the URL. */ + return cm_passthrough; else /* Garbled %.. sequence: encode `%'. */ - return CM_ENCODE; + return cm_encode; } else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) - return CM_ENCODE; + return cm_encode; else - return CM_PASSTHROUGH; + return cm_passthrough; } /* Translate a %-escaped (but possibly non-conformant) input string S @@ -292,16 +287,15 @@ decide_copy_method (const char *p) After a URL has been run through this function, the protocols that use `%' as the quote character can use the resulting string as-is, - while those that don't call url_unescape() to get to the intended - data. This function is also stable: after an input string is - transformed the first time, all further transformations of the - result yield the same result string. + while those that don't can use url_unescape to get to the intended + data. This function is stable: once the input is transformed, + further transformations of the result yield the same output. Let's discuss why this function is needed. - Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw - space character would mess up the HTTP request, it needs to be - quoted, like this: + Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since + a raw space character would mess up the HTTP request, it needs to + be quoted, like this: GET /abc%20def HTTP/1.0 @@ -312,14 +306,15 @@ decide_copy_method (const char *p) part of URL syntax, "%20" is the correct way to denote a literal space on the Wget command line. This leaves us in the conclusion that in that case Wget should not call url_escape, but leave the - `%20' as is. + `%20' as is. This is clearly contradictory, but it only gets + worse. - And what if the requested URI is `abc%20 def'? If we call - url_escape, we end up with `/abc%2520%20def', which is almost - certainly not intended. If we don't call url_escape, we are left - with the embedded space and cannot complete the request. What the - user meant was for Wget to request `/abc%20%20def', and this is - where reencode_escapes kicks in. + What if the requested URI is `abc%20 def'? If we call url_escape, + we end up with `/abc%2520%20def', which is almost certainly not + intended. If we don't call url_escape, we are left with the + embedded space and cannot complete the request. What the user + meant was for Wget to request `/abc%20%20def', and this is where + reencode_escapes kicks in. Wget used to solve this by first decoding %-quotes, and then encoding all the "unsafe" characters found in the resulting string. @@ -333,25 +328,25 @@ decide_copy_method (const char *p) literal plus. reencode_escapes correctly translates the above to "a%2B+b", i.e. returns the original string. - This function uses an algorithm proposed by Anon Sricharoenchai: + This function uses a modified version of the algorithm originally + proposed by Anon Sricharoenchai: - 1. Encode all URL_UNSAFE and the "%" that are not followed by 2 - hexdigits. + * Encode all "unsafe" characters, except those that are also + "reserved", to %XX. See urlchr_table for which characters are + unsafe and reserved. - 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and - "+". + * Encode the "%" characters not followed by two hex digits to + "%25". - ...except that this code conflates the two steps, and decides - whether to encode, decode, or pass through each character in turn. - The function still uses two passes, but their logic is the same -- - the first pass exists merely for the sake of allocation. Another - small difference is that we include `+' to URL_RESERVED. + * Pass through all other characters and %XX escapes as-is. (Up to + Wget 1.10 this decoded %XX escapes corresponding to "safe" + characters, but that was obtrusive and broke some servers.) Anon's test case: "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc" -> - "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc" + "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc" Simpler test cases: @@ -372,7 +367,6 @@ reencode_escapes (const char *s) int oldlen, newlen; int encode_count = 0; - int decode_count = 0; /* First, pass through the string to see if there's anything to do, and to calculate the new length. */ @@ -380,25 +374,21 @@ reencode_escapes (const char *s) { switch (decide_copy_method (p1)) { - case CM_ENCODE: + case cm_encode: ++encode_count; break; - case CM_DECODE: - ++decode_count; - break; - case CM_PASSTHROUGH: + case cm_passthrough: break; } } - if (!encode_count && !decode_count) + if (!encode_count) /* The string is good as it is. */ - return (char *)s; /* C const model sucks. */ + return (char *) s; /* C const model sucks. */ oldlen = p1 - s; - /* Each encoding adds two characters (hex digits), while each - decoding removes two characters. */ - newlen = oldlen + 2 * (encode_count - decode_count); + /* Each encoding adds two characters (hex digits). */ + newlen = oldlen + 2 * encode_count; newstr = xmalloc (newlen + 1); p1 = s; @@ -408,7 +398,7 @@ reencode_escapes (const char *s) { switch (decide_copy_method (p1)) { - case CM_ENCODE: + case cm_encode: { unsigned char c = *p1++; *p2++ = '%'; @@ -416,11 +406,7 @@ reencode_escapes (const char *s) *p2++ = XNUM_TO_DIGIT (c & 0xf); } break; - case CM_DECODE: - *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]); - p1 += 3; /* skip %xx */ - break; - case CM_PASSTHROUGH: + case cm_passthrough: *p2++ = *p1++; } } @@ -869,8 +855,9 @@ url_parse (const char *url, int *error) host_modified = lowercase_str (u->host); /* Decode %HH sequences in host name. This is important not so much - to support %HH sequences, but to support binary characters (which - will have been converted to %HH by reencode_escapes). */ + to support %HH sequences in host names (which other browser + don't), but to support binary characters (which will have been + converted to %HH by reencode_escapes). */ if (strchr (u->host, '%')) { url_unescape (u->host); @@ -1539,11 +1526,6 @@ url_file_name (const struct url *u) "back up one element". Single leading and trailing slashes are preserved. - This function does not handle URL escapes explicitly. If you're - passing paths from URLs, make sure to unquote "%2e" and "%2E" to - ".", so that this function can find the dots. (Wget's URL parser - calls reencode_escapes, which see.) - For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive test examples are provided below. If you change anything in this function, run test_path_simplify to make sure you haven't broken a