1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] New mechanism for quoting file names.

Published in <m3smmzt4px.fsf@hniksic.iskon.hr>.
This commit is contained in:
hniksic 2003-09-14 15:04:13 -07:00
parent ebea9e7e0b
commit 0a3697ad65
13 changed files with 485 additions and 245 deletions

11
NEWS
View File

@ -7,8 +7,6 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
* Changes in Wget 1.9.
** The build process now requires Autoconf 2.5x.
** It is now possible to specify that POST method be used for HTTP
requests. For example, `wget --post-data="id=foo&data=bar" URL' will
send a POST request with the specified contents.
@ -32,6 +30,15 @@ considered a fatal error.
** The new option `--dns-cache=off' may be used to prevent Wget from
caching DNS lookups.
** The build process now requires Autoconf 2.5x.
** Wget no longer quotes characters in local file names that would be
considered "unsafe" as part of URL. Quoting can still occur for
control characters or for '/', but no longer for frequent characters
such as space. You can use the new option --restrict-file-names to
enforce even stricter rules, which is useful when downloading to
Windows partitions.
* Wget 1.8.1 is a bugfix release with no user-visible changes.

View File

@ -1,3 +1,8 @@
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi (Download Options): Document the new option
--restrict-file-names and the corresponding wgetrc command.
2003-09-10 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi (Download Options): Documented new option --dns-cache.

View File

@ -800,6 +800,39 @@ lookups where they're probably not needed.
If you don't understand the above description, you probably won't need
this option.
@cindex file names, restrict
@cindex Windows file names
@itemx --restrict-file-names=none|unix|windows
Restrict characters that may occur in local file names created by Wget
from remote URLs. Characters that are considered @dfn{unsafe} under a
set of restrictions are escaped, i.e. replaced with @samp{%XX}, where
@samp{XX} is the hexadecimal code of the character.
The default for this option depends on the operating system: on Unix and
Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it
defaults to ``windows''. Changing the default is useful when you are
using a non-native partition, e.g. when downloading files to a Windows
partition mounted from Linux, or when using NFS-mounted or SMB-mounted
Windows drives.
When set to ``none'', the only characters that are quoted are those that
are impossible to get into a file name---the NUL character and @samp{/}.
The control characters, newline, etc. are all placed into file names.
When set to ``unix'', additional unsafe characters are those in the
0--31 range and in the 128--159 range. This is because those characters
are typically not printable.
When set to ``windows'', all of the above are quoted, along with
@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead
of @samp{:} to separate host and port in local file names, and uses
@samp{@@} instead of @samp{?} to separate the query portion of the file
name from the rest. Therefore, a URL that would be saved as
@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be
saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows
mode.
@end table
@node Directory Options, HTTP Options, Download Options, Invoking
@ -2241,6 +2274,10 @@ Links}).
If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it
to off is the same as @samp{-nr}.
@item restrict_file_names = off/unix/windows
Restrict the file names generated by Wget from URLs. See
@samp{--restrict-file-names} for a more detailed description.
@item retr_symlinks = on/off
When set to on, retrieve symbolic links as if they were plain files; the
same as @samp{--retr-symlinks}.

View File

@ -1,3 +1,31 @@
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* url.c (append_uri_pathel): Use opt.restrict_file_names when
calling file_unsafe_char.
* init.c: New command restrict_file_names.
* main.c (main): New option --restrict-file-names[=windows,unix].
* url.c (url_file_name): Renamed from url_filename.
(url_file_name): Add directory and hostdir prefix here, not in
mkstruct.
(append_dir_structure): New function, does part of the work that
used to be in mkstruct. Iterates over path elements in u->path,
calling append_uri_pathel on each one to append it to the file
name.
(append_uri_pathel): URL-unescape a path element and reencode it
with a different set of rules, more appropriate for handling of
files.
(file_unsafe_char): New function, uses a lookup table to decide
whether a character should be escaped for use in file name.
(append_string): New utility function.
(append_char): Ditto.
(file_unsafe_char): New argument restrict_for_windows, decide
whether Windows file names should be escaped in run-time.
* connect.c: Include <stdlib.h> to get prototype for abort().
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* utils.c (wtimer_sys_set): Extracted the code that sets the

View File

@ -30,6 +30,7 @@ so, delete this exception statement from your version. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>

View File

@ -842,8 +842,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f)
{
char *tmpu, *tmpp; /* temporary, clean user and passwd */
tmpu = encode_string (u->user);
tmpp = u->passwd ? encode_string (u->passwd) : NULL;
tmpu = url_escape (u->user);
tmpp = u->passwd ? url_escape (u->passwd) : NULL;
upwd = (char *)xmalloc (strlen (tmpu)
+ (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
@ -863,7 +863,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f)
fprintf (fp, " ");
if (f->tstamp != -1)
{
/* #### Should we translate the months? */
/* #### Should we translate the months? Or, even better, use
ISO 8601 dates? */
static char *months[] = {
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"

View File

@ -1025,7 +1025,7 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con)
struct stat st;
if (!con->target)
con->target = url_filename (u);
con->target = url_file_name (u);
if (opt.noclobber && file_exists_p (con->target))
{
@ -1245,7 +1245,7 @@ ftp_get_listing (struct url *u, ccon *con, struct fileinfo **f)
/* Find the listing file name. We do it by taking the file name of
the URL and replacing the last component with the listing file
name. */
uf = url_filename (u);
uf = url_file_name (u);
lf = file_merge (uf, LIST_FILENAME);
xfree (uf);
DEBUGP ((_("Using `%s' as listing tmp file.\n"), lf));
@ -1335,7 +1335,7 @@ ftp_retrieve_list (struct url *u, struct fileinfo *f, ccon *con)
ofile = xstrdup (u->file);
url_set_file (u, f->name);
con->target = url_filename (u);
con->target = url_file_name (u);
err = RETROK;
dlthis = 1;
@ -1723,7 +1723,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
char *filename = (opt.output_document
? xstrdup (opt.output_document)
: (con.target ? xstrdup (con.target)
: url_filename (u)));
: url_file_name (u)));
res = ftp_index (filename, u, f);
if (res == FTPOK && opt.verbose)
{

View File

@ -1614,12 +1614,12 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
hstat.local_file = local_file;
else if (local_file)
{
*local_file = url_filename (u);
*local_file = url_file_name (u);
hstat.local_file = local_file;
}
else
{
dummy = url_filename (u);
dummy = url_file_name (u);
hstat.local_file = &dummy;
}

View File

@ -100,6 +100,7 @@ CMD_DECLARE (cmd_spec_htmlify);
CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_recursive);
CMD_DECLARE (cmd_spec_restrict_file_names);
CMD_DECLARE (cmd_spec_useragent);
/* List of recognized commands, each consisting of name, closure and function.
@ -188,6 +189,7 @@ static struct {
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
{ "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
{ "robots", &opt.use_robots, cmd_boolean },
@ -281,6 +283,13 @@ defaults (void)
opt.dots_in_line = 50;
opt.dns_cache = 1;
/* The default for file name restriction defaults to the OS type. */
#if !defined(WINDOWS) && !defined(__CYGWIN__)
opt.restrict_file_names = restrict_shell;
#else
opt.restrict_file_names = restrict_windows;
#endif
}
/* Return the user's home directory (strdup-ed), or NULL if none is
@ -1008,6 +1017,26 @@ cmd_spec_recursive (const char *com, const char *val, void *closure)
return 1;
}
/* Parse the value of the `restrict_file_names' command (option
   --restrict-file-names).  Recognized keywords are `none', `unix'
   and `windows', matched case-insensitively; anything else prints an
   error and fails.  Returns non-zero on success.  */
static int
cmd_spec_restrict_file_names (const char *com, const char *val, void *closure)
{
  int mode;

  if (!strcasecmp (val, "none"))
    mode = restrict_none;
  else if (!strcasecmp (val, "unix"))
    mode = restrict_shell;
  else if (!strcasecmp (val, "windows"))
    mode = restrict_windows;
  else
    {
      fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"),
	       exec_name, com, val);
      return 0;
    }

  opt.restrict_file_names = mode;
  return 1;
}
static int
cmd_spec_useragent (const char *com, const char *val, void *closure)
{

View File

@ -179,10 +179,11 @@ Download:\n\
--bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\
--limit-rate=RATE limit download rate to RATE.\n\
--dns-cache=off disable caching DNS lookups.\n\
--restrict-file-names=MODE restrict chars in file names to MODE.\n\
\n"), stdout);
fputs (_("\
Directories:\n\
-nd --no-directories don\'t create directories.\n\
-nd, --no-directories don\'t create directories.\n\
-x, --force-directories force creation of directories.\n\
-nH, --no-host-directories don\'t create host directories.\n\
-P, --directory-prefix=PREFIX save files to PREFIX/...\n\
@ -344,6 +345,7 @@ main (int argc, char *const *argv)
{ "proxy-user", required_argument, NULL, 143 },
{ "quota", required_argument, NULL, 'Q' },
{ "reject", required_argument, NULL, 'R' },
{ "restrict-file-names", required_argument, NULL, 176 },
{ "save-cookies", required_argument, NULL, 162 },
{ "timeout", required_argument, NULL, 'T' },
{ "tries", required_argument, NULL, 't' },
@ -610,6 +612,9 @@ GNU General Public License for more details.\n"));
case 175:
setval ("dnscache", optarg);
break;
case 176:
setval ("restrictfilenames", optarg);
break;
case 'A':
setval ("accept", optarg);
break;

View File

@ -184,6 +184,12 @@ struct options
char *post_data; /* POST query string */
char *post_file_name; /* File to post */
enum {
restrict_none,
restrict_shell,
restrict_windows
} restrict_file_names; /* whether we restrict file name chars. */
};
extern struct options opt;

557
src/url.c
View File

@ -1,5 +1,6 @@
/* URL handling.
Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
Free Software Foundation, Inc.
This file is part of GNU Wget.
@ -95,24 +96,22 @@ static int path_simplify PARAMS ((char *));
code assumes ASCII character set and 8-bit chars. */
enum {
/* rfc1738 reserved chars, preserved from encoding. */
urlchr_reserved = 1,
/* rfc1738 unsafe chars, plus some more. */
urlchr_unsafe = 2
};
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Shorthands for the table: */
#define R urlchr_reserved
#define U urlchr_unsafe
#define RU R|U
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
const static unsigned char urlchr_table[256] =
{
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
@ -142,6 +141,9 @@ const static unsigned char urlchr_table[256] =
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
};
#undef R
#undef U
#undef RU
/* Decodes the forms %xy in a URL to the character the hexadecimal
code of which is xy. xy are hexadecimal digits from
@ -150,7 +152,7 @@ const static unsigned char urlchr_table[256] =
literally. */
static void
decode_string (char *s)
url_unescape (char *s)
{
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
@ -175,10 +177,10 @@ decode_string (char *s)
*t = '\0';
}
/* Like encode_string, but return S if there are no unsafe chars. */
/* Like url_escape, but return S if there are no unsafe chars. */
static char *
encode_string_maybe (const char *s)
url_escape_allow_passthrough (const char *s)
{
const char *p1;
char *p2, *newstr;
@ -186,7 +188,7 @@ encode_string_maybe (const char *s)
int addition = 0;
for (p1 = s; *p1; p1++)
if (UNSAFE_CHAR (*p1))
if (URL_UNSAFE_CHAR (*p1))
addition += 2; /* Two more characters (hex digits) */
if (!addition)
@ -199,7 +201,7 @@ encode_string_maybe (const char *s)
p2 = newstr;
while (*p1)
{
if (UNSAFE_CHAR (*p1))
if (URL_UNSAFE_CHAR (*p1))
{
unsigned char c = *p1++;
*p2++ = '%';
@ -215,13 +217,13 @@ encode_string_maybe (const char *s)
return newstr;
}
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
given string, returning a malloc-ed %XX encoded string. */
char *
encode_string (const char *s)
url_escape (const char *s)
{
char *encoded = encode_string_maybe (s);
char *encoded = url_escape_allow_passthrough (s);
if (encoded != s)
return encoded;
else
@ -233,7 +235,7 @@ encode_string (const char *s)
allocated storage. */
#define ENCODE(ptr) do { \
char *e_new = encode_string_maybe (ptr); \
char *e_new = url_escape_allow_passthrough (ptr); \
if (e_new != ptr) \
{ \
xfree (ptr); \
@ -258,7 +260,7 @@ decide_copy_method (const char *p)
char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
XCHAR_TO_XDIGIT (*(p + 2));
if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
return CM_PASSTHROUGH;
else
return CM_DECODE;
@ -267,20 +269,20 @@ decide_copy_method (const char *p)
/* Garbled %.. sequence: encode `%'. */
return CM_ENCODE;
}
else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
return CM_ENCODE;
else
return CM_PASSTHROUGH;
}
/* Translate a %-quoting (but possibly non-conformant) input string S
into a %-quoting (and conformant) output string. If no characters
/* Translate a %-escaped (but possibly non-conformant) input string S
into a %-escaped (and conformant) output string. If no characters
are encoded or decoded, return the same string S; otherwise, return
a freshly allocated string with the new contents.
After a URL has been run through this function, the protocols that
use `%' as the quote character can use the resulting string as-is,
while those that don't call decode_string() to get to the intended
while those that don't call url_unescape() to get to the intended
data. This function is also stable: after an input string is
transformed the first time, all further transformations of the
result yield the same result string.
@ -293,20 +295,21 @@ decide_copy_method (const char *p)
GET /abc%20def HTTP/1.0
So it appears that the unsafe chars need to be quoted, as with
encode_string. But what if we're requested to download
`abc%20def'? Remember that %-encoding is valid URL syntax, so what
the user meant was a literal space, and he was kind enough to quote
it. In that case, Wget should obviously leave the `%20' as is, and
send the same request as above. So in this case we may not call
encode_string.
It appears that the unsafe chars need to be quoted, for example
with url_escape. But what if we're requested to download
`abc%20def'? url_escape transforms "%" to "%25", which would leave
us with `abc%2520def'. This is incorrect -- since %-escapes are
part of URL syntax, "%20" is the correct way to denote a literal
space on the Wget command line. This leaves us in the conclusion
that in that case Wget should not call url_escape, but leave the
`%20' as is.
But what if the requested URI is `abc%20 def'? If we call
encode_string, we end up with `/abc%2520%20def', which is almost
certainly not intended. If we don't call encode_string, we are
left with the embedded space and cannot send the request. What the
And what if the requested URI is `abc%20 def'? If we call
url_escape, we end up with `/abc%2520%20def', which is almost
certainly not intended. If we don't call url_escape, we are left
with the embedded space and cannot complete the request. What the
user meant was for Wget to request `/abc%20%20def', and this is
where reencode_string kicks in.
where reencode_escapes kicks in.
Wget used to solve this by first decoding %-quotes, and then
encoding all the "unsafe" characters found in the resulting string.
@ -317,7 +320,7 @@ decide_copy_method (const char *p)
is inevitable because by the second step we would lose information
on whether the `+' was originally encoded or not. Both results
were wrong because in CGI parameters + means space, while %2B means
literal plus. reencode_string correctly translates the above to
literal plus. reencode_escapes correctly translates the above to
"a%2B+b", i.e. returns the original string.
This function uses an algorithm proposed by Anon Sricharoenchai:
@ -352,7 +355,7 @@ decide_copy_method (const char *p)
"foo%2b+bar" -> "foo%2b+bar" */
static char *
reencode_string (const char *s)
reencode_escapes (const char *s)
{
const char *p1;
char *newstr, *p2;
@ -417,12 +420,12 @@ reencode_string (const char *s)
return newstr;
}
/* Run PTR_VAR through reencode_string. If a new string is consed,
/* Run PTR_VAR through reencode_escapes. If a new string is consed,
free PTR_VAR and make it point to the new storage. Obviously,
PTR_VAR needs to be an lvalue. */
#define REENCODE(ptr_var) do { \
char *rf_new = reencode_string (ptr_var); \
char *rf_new = reencode_escapes (ptr_var); \
if (rf_new != ptr_var) \
{ \
xfree (ptr_var); \
@ -544,9 +547,9 @@ parse_uname (const char *str, int len, char **user, char **passwd)
(*user)[len] = '\0';
if (*user)
decode_string (*user);
url_unescape (*user);
if (*passwd)
decode_string (*passwd);
url_unescape (*passwd);
return 1;
}
@ -611,6 +614,10 @@ rewrite_shorthand_url (const char *url)
static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, with the exception that it returns the pointer to the
terminating zero (end-of-string aka "eos") if no matching character
is found. */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
@ -825,7 +832,7 @@ url_parse (const char *url, int *error)
return NULL;
}
url_encoded = reencode_string (url);
url_encoded = reencode_escapes (url);
p = url_encoded;
p += strlen (supported_schemes[scheme].leading_string);
@ -1032,13 +1039,13 @@ url_error (int error_code)
return parse_errors[error_code];
}
static void
parse_path (const char *quoted_path, char **dir, char **file)
{
char *path, *last_slash;
/* Parse PATH into dir and file. PATH is extracted from the URL and
is URL-escaped. The function returns unescaped DIR and FILE. */
STRDUP_ALLOCA (path, quoted_path);
decode_string (path);
static void
parse_path (const char *path, char **dir, char **file)
{
char *last_slash;
last_slash = strrchr (path, '/');
if (!last_slash)
@ -1051,6 +1058,8 @@ parse_path (const char *quoted_path, char **dir, char **file)
*dir = strdupdelim (path, last_slash);
*file = xstrdup (last_slash + 1);
}
url_unescape (*dir);
url_unescape (*file);
}
/* Note: URL's "full path" is the path with the query string and
@ -1303,8 +1312,6 @@ rotate_backups(const char *fname)
{
sprintf (from, "%s.%d", fname, i - 1);
sprintf (to, "%s.%d", fname, i);
/* #### This will fail on machines without the rename() system
call. */
rename (from, to);
}
@ -1323,11 +1330,14 @@ mkalldirs (const char *path)
int res;
p = path + strlen (path);
for (; *p != '/' && p != path; p--);
for (; *p != '/' && p != path; p--)
;
/* Don't create if it's just a file. */
if ((p == path) && (*p != '/'))
return 0;
t = strdupdelim (path, p);
/* Check whether the directory exists. */
if ((stat (t, &st) == 0))
{
@ -1360,194 +1370,302 @@ mkalldirs (const char *path)
xfree (t);
return res;
}
/* Functions for constructing the file name out of URL components. */
static int
count_slashes (const char *s)
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have an easy way to construct a string by having
   various functions append data to it.  Instead of passing the
   obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
   question, we pass the pointer to this struct.  */

struct growable {
  char *base;			/* the buffer that holds the string */
  int size;			/* number of bytes allocated in BASE */
  int tail;			/* index one past the last used byte */
};

/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing.

   Note: the arguments are parenthesized in the expansion so that
   expression arguments expand safely.  */

#define GROW(g, append_size) do {					\
  struct growable *G_ = (g);						\
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);	\
} while (0)

/* Return the tail position of the string. */

#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */

#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
/* Append the string STR to DEST. NOTICE: the string in DEST is not
terminated. */
static void
append_string (const char *str, struct growable *dest)
{
int i = 0;
while (*s)
if (*s++ == '/')
++i;
return i;
int l = strlen (str);
GROW (dest, l);
memcpy (TAIL (dest), str, l);
TAIL_INCR (dest, l);
}
/* Return the path name of the URL-equivalent file name, with a
remote-like structure of directories. */
static char *
mkstruct (const struct url *u)
{
char *dir, *file;
char *res, *dirpref;
int l;
/* Append CH to DEST. For example, append_char (0, DEST)
zero-terminates DEST. */
if (opt.cut_dirs)
static void
append_char (char ch, struct growable *dest)
{
char *ptr = u->dir + (*u->dir == '/');
int slash_count = 1 + count_slashes (ptr);
int cut = MINVAL (opt.cut_dirs, slash_count);
for (; cut && *ptr; ptr++)
if (*ptr == '/')
--cut;
STRDUP_ALLOCA (dir, ptr);
GROW (dest, 1);
*TAIL (dest) = ch;
TAIL_INCR (dest, 1);
}
/* Bits describing why a character is unsafe in a file name.  These
   must be distinct bit values so that file_unsafe_char can build a
   mask appropriate for the active restriction mode.  */
enum {
  filechr_unsafe_always  = 1,	/* always unsafe, e.g. / or \0 */
  filechr_unsafe_shell   = 2,	/* unsafe for shell use, e.g. control chars */
  filechr_unsafe_windows = 4	/* disallowed on Windows file system */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define A filechr_unsafe_always
#define S filechr_unsafe_shell
#define W filechr_unsafe_windows

/* Forbidden chars:

   always:     \0, /
   Unix shell: 0-31, 128-159
   Windows:    \, |, /, <, >, ?, :, ", *

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

const static unsigned char filechr_table[256] =
{
  A,  S,  S,  S,   S,  S,  S,  S,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  S,  S,  S,  S,   S,  S,  S,  S,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  S,  S,  S,  S,   S,  S,  S,  S,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  S,  S,  S,  S,   S,  S,  S,  S,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0,  A,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 128-143 */
  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef A
#undef S
#undef W
/* Return non-zero if character CH is unsafe for use in file or
directory name. Called by append_uri_pathel. */
static inline int
file_unsafe_char (char ch, int restrict)
{
int mask = filechr_unsafe_always;
if (restrict == restrict_shell)
mask |= filechr_unsafe_shell;
else if (restrict == restrict_windows)
mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
return FILE_CHAR_TEST (ch, mask);
}
/* FN_PORT_SEP is the separator between host and port in file names
for non-standard port numbers. On Unix this is normally ':', as in
"www.xemacs.org:4001/index.html". Under Windows, we set it to +
because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
/* FN_QUERY_SEP is the separator between the file name and the URL
query, normally '?'. Since Windows cannot handle '?' as part of
file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
file_unsafe_char and the corresponding table. */
static void
append_uri_pathel (const char *b, const char *e, struct growable *dest)
{
char *pathel;
int pathlen;
const char *p;
int quoted, outlen;
/* Currently restrict_for_windows is determined at compile time
only. But some users download files to Windows partitions; they
should be able to say --windows-file-names so Wget escapes
characters invalid on Windows. Similar run-time restrictions for
other file systems can be implemented. */
const int restrict = opt.restrict_file_names;
/* Copy [b, e) to PATHEL and URL-unescape it. */
BOUNDED_TO_ALLOCA (b, e, pathel);
url_unescape (pathel);
pathlen = strlen (pathel);
/* Go through PATHEL and check how many characters we'll need to
add for file quoting. */
quoted = 0;
for (p = pathel; *p; p++)
if (file_unsafe_char (*p, restrict))
++quoted;
/* p - pathel is the string length. Each quoted char means two
additional characters in the string, hence 2*quoted. */
outlen = (p - pathel) + (2 * quoted);
GROW (dest, outlen);
if (!quoted)
{
/* If there's nothing to quote, we don't need to go through the
string the second time. */
memcpy (TAIL (dest), pathel, outlen);
}
else
dir = u->dir + (*u->dir == '/');
/* Check for the true name (or at least a consistent name for saving
to directory) of HOST, reusing the hlist if possible. */
if (opt.add_hostdir)
{
/* Add dir_prefix and hostname (if required) to the beginning of
dir. */
dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
+ strlen (u->host)
+ 1 + numdigit (u->port)
+ 1);
if (!DOTP (opt.dir_prefix))
sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
char *q = TAIL (dest);
for (p = pathel; *p; p++)
{
if (!file_unsafe_char (*p, restrict))
*q++ = *p;
else
strcpy (dirpref, u->host);
if (u->port != scheme_default_port (u->scheme))
{
int len = strlen (dirpref);
dirpref[len] = ':';
number_to_string (dirpref + len + 1, u->port);
unsigned char ch = *p;
*q++ = '%';
*q++ = XDIGIT_TO_XCHAR (ch >> 4);
*q++ = XDIGIT_TO_XCHAR (ch & 0xf);
}
}
else /* not add_hostdir */
assert (q - TAIL (dest) == outlen);
}
TAIL_INCR (dest, outlen);
}
/* Append to DEST the directory structure that corresponds the
directory part of URL's path. For example, if the URL is
http://server/dir1/dir2/file, this appends "/dir1/dir2".
Each path element ("dir1" and "dir2" in the above example) is
examined, url-unescaped, and re-escaped as file name element.
Additionally, it cuts as many directories from the path as
specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
will produce "/dir2" for the above example; for 2 or more, it will
produce "".
Each component of the path is quoted for use as file name. */
static void
append_dir_structure (const struct url *u, struct growable *dest)
{
if (!DOTP (opt.dir_prefix))
dirpref = opt.dir_prefix;
else
dirpref = "";
}
char *pathel, *next;
int cut = opt.cut_dirs;
/* If there is a prefix, prepend it. */
if (*dirpref)
/* Go through the path components, de-URL-quote them, and quote them
(if necessary) as file names. */
pathel = u->path;
for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
{
char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
dir = newdir;
}
if (cut-- > 0)
continue;
if (pathel == next)
/* Ignore empty pathels. path_simplify should remove
occurrences of "//" from the path, but it has special cases
for starting / which generates an empty pathel here. */
continue;
l = strlen (dir);
if (l && dir[l - 1] == '/')
dir[l - 1] = '\0';
if (!*u->file)
file = "index.html";
else
file = u->file;
/* Finally, construct the full name. */
res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
+ 1);
sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
return res;
}
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
an escaped query string. The trick is to make sure that unsafe
characters in BASE are escaped, and that slashes in QUERY are also
escaped. */
static char *
compose_file_name (char *base, char *query)
{
char result[256];
char *from;
char *to = result;
/* Copy BASE to RESULT and encode all unsafe characters. */
from = base;
while (*from && to - result < sizeof (result))
{
if (UNSAFE_CHAR (*from))
{
unsigned char c = *from++;
*to++ = '%';
*to++ = XDIGIT_TO_XCHAR (c >> 4);
*to++ = XDIGIT_TO_XCHAR (c & 0xf);
}
else
*to++ = *from++;
}
if (query && to - result < sizeof (result))
{
*to++ = '?';
/* Copy QUERY to RESULT and encode all '/' characters. */
from = query;
while (*from && to - result < sizeof (result))
{
if (*from == '/')
{
*to++ = '%';
*to++ = '2';
*to++ = 'F';
++from;
}
else
*to++ = *from++;
if (dest->tail)
append_char ('/', dest);
append_uri_pathel (pathel, next, dest);
}
}
if (to - result < sizeof (result))
*to = '\0';
else
/* Truncate input which is too long, presumably due to a huge
query string. */
result[sizeof (result) - 1] = '\0';
/* Return a unique file name that matches the given URL as good as
possible. Does not create directories on the file system. */
return xstrdup (result);
}
/* Create a unique filename, corresponding to a given URL. Calls
mkstruct if necessary. Does *not* actually create any directories. */
char *
url_filename (const struct url *u)
url_file_name (const struct url *u)
{
char *file, *name;
struct growable fnres;
char *query = u->query && *u->query ? u->query : NULL;
char *u_file, *u_query;
char *fname, *unique;
fnres.base = NULL;
fnres.size = 0;
fnres.tail = 0;
/* Start with the directory prefix, if specified. */
if (!DOTP (opt.dir_prefix))
append_string (opt.dir_prefix, &fnres);
/* If "dirstruct" is turned on (typically the case with -r), add
the host and port (unless those have been turned off) and
directory structure. */
if (opt.dirstruct)
{
char *base = mkstruct (u);
file = compose_file_name (base, query);
xfree (base);
}
else
if (opt.add_hostdir)
{
char *base = *u->file ? u->file : "index.html";
file = compose_file_name (base, query);
/* Check whether the prefix directory is something other than "."
before prepending it. */
if (!DOTP (opt.dir_prefix))
if (fnres.tail)
append_char ('/', &fnres);
append_string (u->host, &fnres);
if (u->port != scheme_default_port (u->scheme))
{
/* #### should just realloc FILE and prepend dir_prefix. */
char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
+ 1 + strlen (file) + 1);
sprintf (nfile, "%s/%s", opt.dir_prefix, file);
xfree (file);
file = nfile;
char portstr[24];
number_to_string (portstr, u->port);
append_char (FN_PORT_SEP, &fnres);
append_string (portstr, &fnres);
}
}
/* DOS-ish file systems don't like `%' signs in them; we change it
to `@'. */
#ifdef WINDOWS
{
char *p = file;
for (p = file; *p; p++)
if (*p == '%')
*p = '@';
append_dir_structure (u, &fnres);
}
#endif /* WINDOWS */
/* Add the file name. */
if (fnres.tail)
append_char ('/', &fnres);
u_file = *u->file ? u->file : "index.html";
append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
/* Append "?query" to the file name. */
u_query = u->query && *u->query ? u->query : NULL;
if (u_query)
{
append_char (FN_QUERY_SEP, &fnres);
append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
}
/* Zero-terminate the file name. */
append_char ('\0', &fnres);
fname = fnres.base;
/* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc).
@ -1557,17 +1675,18 @@ url_filename (const struct url *u)
The exception is the case when file does exist and is a
directory (actually support for bad httpd-s). */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (file) && !file_non_directory_p (file)))
return file;
&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
return fnres.base;
/* Find a unique name. */
name = unique_name (file);
xfree (file);
return name;
unique = unique_name (fname);
xfree (fname);
return unique;
}
/* Return the langth of URL's path. Path is considered to be
/* Return the length of URL's path. Path is considered to be
terminated by one of '?', ';', '#', or by the end of the
string. */
static int
@ -1680,8 +1799,10 @@ path_simplify (char *path)
else if (*p == '/')
{
/* Remove empty path elements. Not mandated by rfc1808 et
al, but empty path elements are not all that useful, and
the rest of Wget might not deal with them well. */
al, but it seems like a good idea to get rid of them.
Supporting them properly is hard (in which directory do
you save http://x.com///y.html?) and they don't seem to
bring much gain. */
char *q = p;
while (*q == '/')
++q;
@ -1964,13 +2085,13 @@ url_string (const struct url *url, int hide_password)
/* Make sure the user name and password are quoted. */
if (url->user)
{
quoted_user = encode_string_maybe (url->user);
quoted_user = url_escape_allow_passthrough (url->user);
if (url->passwd)
{
if (hide_password)
quoted_passwd = HIDDEN_PASSWORD;
else
quoted_passwd = encode_string_maybe (url->passwd);
quoted_passwd = url_escape_allow_passthrough (url->passwd);
}
}

View File

@ -130,7 +130,7 @@ typedef enum
/* Function declarations */
char *encode_string PARAMS ((const char *));
char *url_escape PARAMS ((const char *));
struct url *url_parse PARAMS ((const char *, int *));
const char *url_error PARAMS ((int));
@ -157,7 +157,7 @@ char *uri_merge PARAMS ((const char *, const char *));
void rotate_backups PARAMS ((const char *));
int mkalldirs PARAMS ((const char *));
char *url_filename PARAMS ((const struct url *));
char *url_file_name PARAMS ((const struct url *));
char *getproxy PARAMS ((struct url *));
int no_proxy_match PARAMS ((const char *, const char **));