1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Improved --restrict-file-names to accept ",nocontrol".

This commit is contained in:
hniksic 2003-09-16 18:32:05 -07:00
parent b7f202e5e0
commit aa24b822ca
7 changed files with 109 additions and 80 deletions

View File

@ -1,3 +1,8 @@
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi (Download Options): Explain new --restrict-file-names
semantics.
2003-09-16 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi: Set the man page title to a string more descriptive

View File

@ -808,36 +808,39 @@ this option.
@cindex file names, restrict
@cindex Windows file names
@itemx --restrict-file-names=none|unix|windows
Restrict characters that may occur in local file names created by Wget
from remote URLs. Characters that are considered @dfn{unsafe} under a
set of restrictions are escaped, i.e. replaced with @samp{%XX}, where
@samp{XX} is the hexadecimal code of the character.
@itemx --restrict-file-names=@var{mode}
Change which characters found in remote URLs may show up in local file
names generated from those URLs. Characters that are @dfn{restricted}
by this option are escaped, i.e. replaced with @samp{%HH}, where
@samp{HH} is the hexadecimal number that corresponds to the restricted
character.
The default for this option depends on the operating system: on Unix and
Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it
defaults to ``windows''. Changing the default is useful when you are
using a non-native partition, e.g. when downloading files to a Windows
partition mounted from Linux, or when using NFS-mounted or SMB-mounted
Windows drives.
By default, Wget escapes the characters that are not valid as part of
file names on your operating system, as well as control characters that
are typically unprintable. This option is useful for changing these
defaults, either because you are downloading to a non-native partition,
or because you want to disable escaping of the control characters.
When set to ``none'', the only characters that are quoted are those that
are impossible to get into a file name---the NUL character and @samp{/}.
The control characters, newline, etc. are all placed into file names.
When mode is set to ``unix'', Wget escapes the character @samp{/} and
the control characters in the ranges 0--31 and 128--159. This is the
default on Unix-like OS'es.
When set to ``unix'', additional unsafe characters are those in the
0--31 range and in the 128--159 range. This is because those characters
are typically not printable.
When set to ``windows'', all of the above are quoted, along with
@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead
of @samp{:} to separate host and port in local file names, and uses
When mode is set to ``windows'', Wget escapes the characters @samp{\},
@samp{|}, @samp{/}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
@samp{>}, and the control characters in the ranges 0--31 and 128--159.
In addition to this, Wget in Windows mode uses @samp{+} instead of
@samp{:} to separate host and port in local file names, and uses
@samp{@@} instead of @samp{?} to separate the query portion of the file
name from the rest. Therefore, a URL that would be saved as
@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be
saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows
mode.
mode. This mode is the default on Windows.
If you append @samp{,nocontrol} to the mode, as in
@samp{unix,nocontrol}, escaping of the control characters is also
switched off. You can use @samp{--restrict-file-names=nocontrol} to
turn off escaping of control characters without affecting the choice of
the OS to use as file name restriction mode.
@end table
@node Directory Options, HTTP Options, Download Options, Invoking
@ -2279,7 +2282,7 @@ Links}).
If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it
to off is the same as @samp{-nr}.
@item restrict_file_names = off/unix/windows
@item restrict_file_names = unix/windows
Restrict the file names generated by Wget from URLs. See
@samp{--restrict-file-names} for a more detailed description.

View File

@ -1,3 +1,14 @@
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
* init.c (cmd_spec_restrict_file_names): Allow the OS setting to
be augmented by ",nocontrol" which means don't escape the control
characters, but otherwise keep OS settings.
* url.c (file_unsafe_char): Deleted.
(append_uri_pathel): Query filechr_table directly.
(filechr_table): Separated Unix, Windows, and control-unsafe
characters.
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
* url.c (url_escape_1): New function.

View File

@ -189,7 +189,7 @@ static struct {
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
{ "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
{ "robots", &opt.use_robots, cmd_boolean },
@ -286,10 +286,11 @@ defaults (void)
/* The default for file name restriction defaults to the OS type. */
#if !defined(WINDOWS) && !defined(__CYGWIN__)
opt.restrict_file_names = restrict_shell;
opt.restrict_files_os = restrict_unix;
#else
opt.restrict_file_names = restrict_windows;
opt.restrict_files_os = restrict_windows;
#endif
opt.restrict_files_ctrl = 1;
}
/* Return the user's home directory (strdup-ed), or NULL if none is
@ -1020,20 +1021,41 @@ cmd_spec_recursive (const char *com, const char *val, void *closure)
/* Parse VAL, the value supplied for the setting named COM, and update
   the file name restriction options.

   NOTE(review): this listing appears to show the old and the new body
   of the function merged together.  The statements that assign to
   opt.restrict_file_names look like the pre-change code -- confirm
   against the real init.c before relying on this text.

   In the code that works on restrict_os/restrict_ctrl, VAL has the
   form WORD or WORD,nocontrol, where WORD is one of `unix', `windows'
   or `nocontrol'.

   Returns 1 when VAL is accepted.  Otherwise prints a message to
   stderr and returns 0 without storing anything into
   opt.restrict_files_os or opt.restrict_files_ctrl.  CLOSURE is not
   referenced by the visible code.  */
static int
cmd_spec_restrict_file_names (const char *com, const char *val, void *closure)
{
/* The currently accepted values are `none', `unix', and
`windows'. */
if (0 == strcasecmp (val, "none"))
opt.restrict_file_names = restrict_none;
else if (0 == strcasecmp (val, "unix"))
opt.restrict_file_names = restrict_shell;
else if (0 == strcasecmp (val, "windows"))
opt.restrict_file_names = restrict_windows;
/* Start from the current settings so that the part VAL does not
   mention keeps its present value.  */
int restrict_os = opt.restrict_files_os;
int restrict_ctrl = opt.restrict_files_ctrl;
/* END marks the end of the first word: the first comma, or the
   terminating NUL when VAL contains no comma.  */
const char *end = strchr (val, ',');
if (!end)
end = val + strlen (val);
/* Presumably BOUNDED_EQUAL compares the text in [val, end) with the
   literal; it is defined elsewhere -- verify.  */
#define VAL_IS(string_literal) BOUNDED_EQUAL (val, end, string_literal)
if (VAL_IS ("unix"))
restrict_os = restrict_unix;
else if (VAL_IS ("windows"))
restrict_os = restrict_windows;
else if (VAL_IS ("nocontrol"))
restrict_ctrl = 0;
else
{
/* Shared failure exit; also reached by the goto in the suffix check
   below.  */
err:
fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"),
exec_name, com, val);
return 0;
}
#undef VAL_IS
/* A comma was found.  The only suffix accepted after it is exactly
   "nocontrol" (strcmp, so the match is case-sensitive).  */
if (*end)
{
if (!strcmp (end + 1, "nocontrol"))
restrict_ctrl = 0;
else
goto err;
}
/* Store the parsed values only after the whole specification has been
   validated.  */
opt.restrict_files_os = restrict_os;
opt.restrict_files_ctrl = restrict_ctrl;
return 1;
}

View File

@ -179,7 +179,7 @@ Download:\n\
--bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\
--limit-rate=RATE limit download rate to RATE.\n\
--dns-cache=off disable caching DNS lookups.\n\
--restrict-file-names=MODE restrict chars in file names to MODE.\n\
--restrict-file-names=OS restrict chars in file names to ones OS allows.\n\
\n"), stdout);
fputs (_("\
Directories:\n\

View File

@ -186,10 +186,12 @@ struct options
char *post_file_name; /* File to post */
enum {
restrict_none,
restrict_shell,
restrict_unix,
restrict_windows
} restrict_file_names; /* whether we restrict file name chars. */
} restrict_files_os; /* file name restriction ruleset. */
int restrict_files_ctrl; /* non-zero if control chars in URLs
are restricted from appearing in
generated file names. */
};
extern struct options opt;

View File

@ -1479,23 +1479,22 @@ append_char (char ch, struct growable *dest)
}
/* Bit flags describing why a character may need escaping in a file
   name.  FILE_CHAR_TEST ANDs the filechr_table entry for a character
   with a mask built from these values.

   NOTE(review): both the filechr_unsafe_* and the filechr_not_* /
   filechr_control names are listed here, and several share the same
   value.  This looks like the old and new enumerator sets shown
   together -- confirm which set is live in the real url.c.  */
enum {
filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */
filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */
filechr_unsafe_windows = 2, /* disallowed on Windows file system */
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
filechr_control = 4, /* a control character, e.g. 0-31 */
};
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
/* Shorthands for the table: */
#define A filechr_unsafe_always
#define S filechr_unsafe_shell
#define W filechr_unsafe_windows
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control
/* Forbidden chars:
#define UW U|W
#define UWC U|W|C
always: \0, /
Unix shell: 0-31, 128-159
Windows: \, |, /, <, >, ?, :
/* Table of characters unsafe under various conditions (see above).
Arguably we could also claim `%' to be unsafe, since we use it as
the escape character. If we ever want to be able to reliably
@ -1504,12 +1503,12 @@ enum {
const static unsigned char filechr_table[256] =
{
A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */
S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */
S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */
UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */
0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
@ -1521,8 +1520,8 @@ const static unsigned char filechr_table[256] =
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -1532,30 +1531,16 @@ const static unsigned char filechr_table[256] =
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* Return non-zero if character CH is unsafe for use in file or
   directory name.  Called by append_uri_pathel.

   RESTRICT selects the rule set.  The mask always includes
   filechr_unsafe_always; restrict_shell adds filechr_unsafe_shell, and
   restrict_windows adds both filechr_unsafe_shell and
   filechr_unsafe_windows.  Any other RESTRICT value tests only the
   always-unsafe class.  The result is the filechr_table entry for CH
   masked with that set.

   NOTE(review): the ChangeLog entry accompanying this listing records
   file_unsafe_char as deleted, with append_uri_pathel querying
   filechr_table directly -- confirm this function still exists before
   calling it.  Also, `restrict' is a keyword in C99, so this parameter
   name would need a C89 compile mode or a rename.  */
static inline int
file_unsafe_char (char ch, int restrict)
{
/* Characters in the always-unsafe class are tested in every mode.  */
int mask = filechr_unsafe_always;
if (restrict == restrict_shell)
mask |= filechr_unsafe_shell;
else if (restrict == restrict_windows)
mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
return FILE_CHAR_TEST (ch, mask);
}
/* FN_PORT_SEP is the separator between host and port in file names
for non-standard port numbers. On Unix this is normally ':', as in
"www.xemacs.org:4001/index.html". Under Windows, we set it to +
because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
/* FN_QUERY_SEP is the separator between the file name and the URL
query, normally '?'. Since Windows cannot handle '?' as part of
file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
@ -1570,12 +1555,13 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
const char *p;
int quoted, outlen;
/* Currently restrict_for_windows is determined at compile time
only. But some users download files to Windows partitions; they
should be able to say --windows-file-names so Wget escapes
characters invalid on Windows. Similar run-time restrictions for
other file systems can be implemented. */
const int restrict = opt.restrict_file_names;
int mask;
if (opt.restrict_files_os == restrict_unix)
mask = filechr_not_unix;
else
mask = filechr_not_windows;
if (opt.restrict_files_ctrl)
mask |= filechr_control;
/* Copy [b, e) to PATHEL and URL-unescape it. */
BOUNDED_TO_ALLOCA (b, e, pathel);
@ -1586,7 +1572,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
add for file quoting. */
quoted = 0;
for (p = pathel; *p; p++)
if (file_unsafe_char (*p, restrict))
if (FILE_CHAR_TEST (*p, mask))
++quoted;
/* p - pathel is the string length. Each quoted char means two
@ -1605,7 +1591,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
char *q = TAIL (dest);
for (p = pathel; *p; p++)
{
if (!file_unsafe_char (*p, restrict))
if (!FILE_CHAR_TEST (*p, mask))
*q++ = *p;
else
{