1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Preserve consecutive slashes in URLs. Make sure leading ".."'s are

not stripped away.
This commit is contained in:
hniksic 2003-11-13 17:36:02 -08:00
parent 9d907933ad
commit 6ddd0006e4
2 changed files with 94 additions and 108 deletions

View File

@ -1,3 +1,14 @@
2003-11-14 Hrvoje Niksic <hniksic@xemacs.org>
* url.c (uri_merge): Merging "foo" and "bar" should result in
"bar", not in "foo/bar".
(path_simplify): Don't remove empty path elements; don't
special-case leading slash.
(path_simplify): Don't swallow ".."'s at the beginning of string.
E.g. simplify "foo/../../bar" as "../bar", not as "bar".
(append_uri_pathel): Defang ".." path element upon encountering
it.
2003-11-13 Hrvoje Niksic <hniksic@xemacs.org> 2003-11-13 Hrvoje Niksic <hniksic@xemacs.org>
* http.c (persistent_available_p): Don't attempt to talk to two * http.c (persistent_available_p): Don't attempt to talk to two

191
src/url.c
View File

@ -1469,22 +1469,31 @@ append_uri_pathel (const char *b, const char *e, int escaped_p,
e = unescaped + strlen (unescaped); e = unescaped + strlen (unescaped);
} }
/* Defang ".." when found as component of path. Remember that path
comes from the URL and might contain malicious input. */
if (e - b == 2 && b[0] == '.' && b[1] == '.')
{
b = "%2E%2E";
e = b + 6;
}
/* Walk the PATHEL string and check how many characters we'll need /* Walk the PATHEL string and check how many characters we'll need
to add for file quoting. */ to quote. */
quoted = 0; quoted = 0;
for (p = b; p < e; p++) for (p = b; p < e; p++)
if (FILE_CHAR_TEST (*p, mask)) if (FILE_CHAR_TEST (*p, mask))
++quoted; ++quoted;
/* e-b is the string length. Each quoted char means two additional /* Calculate the length of the output string. e-b is the input
string length. Each quoted char introduces two additional
characters in the string, hence 2*quoted. */ characters in the string, hence 2*quoted. */
outlen = (e - b) + (2 * quoted); outlen = (e - b) + (2 * quoted);
GROW (dest, outlen); GROW (dest, outlen);
if (!quoted) if (!quoted)
{ {
/* If there's nothing to quote, we don't need to go through the /* If there's nothing to quote, we can simply append the string
string the second time. */ without processing it again. */
memcpy (TAIL (dest), b, outlen); memcpy (TAIL (dest), b, outlen);
} }
else else
@ -1623,29 +1632,6 @@ url_file_name (const struct url *u)
xfree (fname); xfree (fname);
return unique; return unique;
} }
/* Compute how many characters of URL belong to its path component.
   The path ends at the first '?', ';' or '#', or at the end of the
   string when none of those characters is present.  */
static int
path_length (const char *url)
{
  const char *path_end = strpbrk_or_eos (url, "?;#");

  return path_end - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?

   The range is half-open: E itself is never dereferenced, and the
   first character, *B, is examined.  An empty range (B == E) yields
   NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so the positions visited are e-1 down
     to b.  Testing *e first would read one past the range and would
     never look at *b.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Resolve "." and ".." elements of PATH by destructively modifying /* Resolve "." and ".." elements of PATH by destructively modifying
PATH and return non-zero if PATH has been modified, zero otherwise. PATH and return non-zero if PATH has been modified, zero otherwise.
@ -1669,15 +1655,10 @@ find_last_char (const char *b, const char *e, char c)
static int static int
path_simplify (char *path) path_simplify (char *path)
{ {
char *h, *t, *end; char *h = path; /* hare */
char *t = path; /* tortoise */
/* Preserve the leading '/'. */ char *beg = path; /* boundary for backing the tortoise */
if (path[0] == '/') char *end = path + strlen (path);
++path;
h = path; /* hare */
t = path; /* tortoise */
end = path + strlen (path);
while (h < end) while (h < end)
{ {
@ -1691,28 +1672,27 @@ path_simplify (char *path)
else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
{ {
/* Handle "../" by retreating the tortoise by one path /* Handle "../" by retreating the tortoise by one path
element -- but not past beginning of PATH. */ element -- but not past beginning. */
if (t > path) if (t > beg)
{ {
/* Move backwards until T hits the beginning of the /* Move backwards until T hits the beginning of the
previous path element or the beginning of path. */ previous path element or the beginning of path. */
for (--t; t > path && t[-1] != '/'; t--) for (--t; t > beg && t[-1] != '/'; t--)
; ;
} }
else
{
/* If we're at the beginning, copy the "../" literally and
move the beginning so a later ".." doesn't remove
it. */
beg = t + 3;
goto regular;
}
h += 3; h += 3;
} }
else if (*h == '/')
{
/* Ignore empty path elements. Supporting them well is hard
(where do you save "http://x.com///y.html"?), and they
don't bring any practical gain. Plus, they break our
filesystem-influenced assumptions: allowing them would
make "x/y//../z" simplify to "x/y/z", whereas most people
would expect "x/z". */
++h;
}
else else
{ {
regular:
/* A regular path element. If H hasn't advanced past T, /* A regular path element. If H hasn't advanced past T,
simply skip to the next path element. Otherwise, copy simply skip to the next path element. Otherwise, copy
the path element until the next slash. */ the path element until the next slash. */
@ -1741,6 +1721,30 @@ path_simplify (char *path)
return t != h; return t != h;
} }
/* Compute how many characters of URL belong to its path component.
   The path ends at the first '?', ';' or '#', or at the end of the
   string when none of those characters is present.  */
static int
path_length (const char *url)
{
  const char *path_end = strpbrk_or_eos (url, "?;#");

  return path_end - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  We might want to use memrchr (a GNU
   extension) under GNU libc.

   The range is half-open: E itself is never dereferenced, and the
   first character, *B, is examined.  An empty range (B == E) yields
   NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so the positions visited are e-1 down
     to b.  Testing *e first would read one past the range and would
     never look at *b.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Merge BASE with LINK and return the resulting URI. /* Merge BASE with LINK and return the resulting URI.
Either of the URIs may be absolute or relative, complete with the Either of the URIs may be absolute or relative, complete with the
@ -1748,8 +1752,10 @@ path_simplify (char *path)
foreseeable cases. It only employs minimal URL parsing, without foreseeable cases. It only employs minimal URL parsing, without
knowledge of the specifics of schemes. knowledge of the specifics of schemes.
Perhaps this function should call path_simplify so that the callers I briefly considered making this function call path_simplify after
don't have to call url_parse unconditionally. */ the merging process, as rfc1738 seems to suggest. This is a bad
idea for several reasons: 1) it complexifies the code, and 2)
url_parse has to simplify path anyway, so it's wasteful to boot. */
char * char *
uri_merge (const char *base, const char *link) uri_merge (const char *base, const char *link)
@ -1899,24 +1905,8 @@ uri_merge (const char *base, const char *link)
const char *last_slash = find_last_char (base, end, '/'); const char *last_slash = find_last_char (base, end, '/');
if (!last_slash) if (!last_slash)
{ {
/* No slash found at all. Append LINK to what we have, /* No slash found at all. Replace what we have with LINK. */
but we'll need a slash as a separator. start_insert = base;
Example: if base == "foo" and link == "qux/xyzzy", then
we cannot just append link to base, because we'd get
"fooqux/xyzzy", whereas what we want is
"foo/qux/xyzzy".
To make sure the / gets inserted, we set
need_explicit_slash to 1. We also set start_insert
to end + 1, so that the length calculations work out
correctly for one more (slash) character. Accessing
that character is fine, since it will be the
delimiter, '\0' or '?'. */
/* example: "foo?..." */
/* ^ ('?' gets changed to '/') */
start_insert = end + 1;
need_explicit_slash = 1;
} }
else if (last_slash && last_slash >= base + 2 else if (last_slash && last_slash >= base + 2
&& last_slash[-2] == ':' && last_slash[-1] == '/') && last_slash[-2] == ':' && last_slash[-1] == '/')
@ -2095,10 +2085,10 @@ run_test (char *test, char *expected_result, int expected_change)
if (modified != expected_change) if (modified != expected_change)
{ {
if (expected_change == 1) if (expected_change == 1)
printf ("Expected no modification with path_simplify(\"%s\").\n", printf ("Expected modification with path_simplify(\"%s\").\n",
test); test);
else else
printf ("Expected modification with path_simplify(\"%s\").\n", printf ("Expected no modification with path_simplify(\"%s\").\n",
test); test);
} }
xfree (test_copy); xfree (test_copy);
@ -2111,24 +2101,28 @@ test_path_simplify (void)
char *test, *result; char *test, *result;
int should_modify; int should_modify;
} tests[] = { } tests[] = {
{ "", "", 0 }, { "", "", 0 },
{ ".", "", 1 }, { ".", "", 1 },
{ "..", "", 1 }, { "./", "", 1 },
{ "foo", "foo", 0 }, { "..", "..", 0 },
{ "foo/bar", "foo/bar", 0 }, { "../", "../", 0 },
{ "foo///bar", "foo/bar", 1 }, { "foo", "foo", 0 },
{ "foo/.", "foo/", 1 }, { "foo/bar", "foo/bar", 0 },
{ "foo/./", "foo/", 1 }, { "foo///bar", "foo///bar", 0 },
{ "foo./", "foo./", 0 }, { "foo/.", "foo/", 1 },
{ "foo/../bar", "bar", 1 }, { "foo/./", "foo/", 1 },
{ "foo/../bar/", "bar/", 1 }, { "foo./", "foo./", 0 },
{ "foo/bar/..", "foo/", 1 }, { "foo/../bar", "bar", 1 },
{ "foo/bar/../x", "foo/x", 1 }, { "foo/../bar/", "bar/", 1 },
{ "foo/bar/../x/", "foo/x/", 1 }, { "foo/bar/..", "foo/", 1 },
{ "foo/..", "", 1 }, { "foo/bar/../x", "foo/x", 1 },
{ "foo/../..", "", 1 }, { "foo/bar/../x/", "foo/x/", 1 },
{ "a/b/../../c", "c", 1 }, { "foo/..", "", 1 },
{ "./a/../b", "b", 1 } { "foo/../..", "..", 1 },
{ "foo/../../..", "../..", 1 },
{ "foo/../../bar/../../baz", "../../baz", 1 },
{ "a/b/../../c", "c", 1 },
{ "./a/../b", "b", 1 }
}; };
int i; int i;
@ -2139,24 +2133,5 @@ test_path_simplify (void)
int expected_change = tests[i].should_modify; int expected_change = tests[i].should_modify;
run_test (test, expected_result, expected_change); run_test (test, expected_result, expected_change);
} }
/* Now run all the tests with a leading slash before the test case,
to prove that the slash is being preserved. */
for (i = 0; i < countof (tests); i++)
{
char *test, *expected_result;
int expected_change = tests[i].should_modify;
test = xmalloc (1 + strlen (tests[i].test) + 1);
sprintf (test, "/%s", tests[i].test);
expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
sprintf (expected_result, "/%s", tests[i].result);
run_test (test, expected_result, expected_change);
xfree (test);
xfree (expected_result);
}
} }
#endif #endif