mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Improved --restrict-file-names to accept ",nocontrol".
This commit is contained in:
parent
b7f202e5e0
commit
aa24b822ca
@ -1,3 +1,8 @@
|
||||
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* wget.texi (Download Options): Explain new --restrict-file-names
|
||||
semantics.
|
||||
|
||||
2003-09-16 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* wget.texi: Set the man page title to a string more descriptive
|
||||
|
@ -808,36 +808,39 @@ this option.
|
||||
|
||||
@cindex file names, restrict
|
||||
@cindex Windows file names
|
||||
@itemx --restrict-file-names=none|unix|windows
|
||||
Restrict characters that may occur in local file names created by Wget
|
||||
from remote URLs. Characters that are considered @dfn{unsafe} under a
|
||||
set of restrictions are escaped, i.e. replaced with @samp{%XX}, where
|
||||
@samp{XX} is the hexadecimal code of the character.
|
||||
@itemx --restrict-file-names=@var{mode}
|
||||
Change which characters found in remote URLs may show up in local file
|
||||
names generated from those URLs. Characters that are @dfn{restricted}
|
||||
by this option are escaped, i.e. replaced with @samp{%HH}, where
|
||||
@samp{HH} is the hexadecimal number that corresponds to the restricted
|
||||
character.
|
||||
|
||||
The default for this option depends on the operating system: on Unix and
|
||||
Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it
|
||||
defaults to ``windows''. Changing the default is useful when you are
|
||||
using a non-native partition, e.g. when downloading files to a Windows
|
||||
partition mounted from Linux, or when using NFS-mounted or SMB-mounted
|
||||
Windows drives.
|
||||
By default, Wget escapes the characters that are not valid as part of
|
||||
file names on your operating system, as well as control characters that
|
||||
are typically unprintable. This option is useful for changing these
|
||||
defaults, either because you are downloading to a non-native partition,
|
||||
or because you want to disable escaping of the control characters.
|
||||
|
||||
When set to ``none'', the only characters that are quoted are those that
|
||||
are impossible to get into a file name---the NUL character and @samp{/}.
|
||||
The control characters, newline, etc. are all placed into file names.
|
||||
When mode is set to ``unix'', Wget escapes the character @samp{/} and
|
||||
the control characters in the ranges 0--31 and 128--159. This is the
|
||||
default on Unix-like operating systems.
|
||||
|
||||
When set to ``unix'', additional unsafe characters are those in the
|
||||
0--31 range and in the 128--159 range. This is because those characters
|
||||
are typically not printable.
|
||||
|
||||
When set to ``windows'', all of the above are quoted, along with
|
||||
@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
|
||||
and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead
|
||||
of @samp{:} to separate host and port in local file names, and uses
|
||||
When mode is set to ``windows'', Wget escapes the characters @samp{\},
|
||||
@samp{|}, @samp{/}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
|
||||
@samp{>}, and the control characters in the ranges 0--31 and 128--159.
|
||||
In addition to this, Wget in Windows mode uses @samp{+} instead of
|
||||
@samp{:} to separate host and port in local file names, and uses
|
||||
@samp{@@} instead of @samp{?} to separate the query portion of the file
|
||||
name from the rest. Therefore, a URL that would be saved as
|
||||
@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be
|
||||
saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows
|
||||
mode.
|
||||
mode. This mode is the default on Windows.
|
||||
|
||||
If you append @samp{,nocontrol} to the mode, as in
|
||||
@samp{unix,nocontrol}, escaping of the control characters is also
|
||||
switched off. You can use @samp{--restrict-file-names=nocontrol} to
|
||||
turn off escaping of control characters without affecting the choice of
|
||||
the OS to use as file name restriction mode.
|
||||
@end table
|
||||
|
||||
@node Directory Options, HTTP Options, Download Options, Invoking
|
||||
@ -2279,7 +2282,7 @@ Links}).
|
||||
If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it
|
||||
to off is the same as @samp{-nr}.
|
||||
|
||||
@item restrict_file_names = off/unix/windows
|
||||
@item restrict_file_names = unix/windows
|
||||
Restrict the file names generated by Wget from URLs. See
|
||||
@samp{--restrict-file-names} for a more detailed description.
|
||||
|
||||
|
@ -1,3 +1,14 @@
|
||||
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* init.c (cmd_spec_restrict_file_names): Allow the OS setting to
|
||||
be augmented by ",nocontrol" which means don't escape the control
|
||||
characters, but otherwise keep OS settings.
|
||||
|
||||
* url.c (file_unsafe_char): Deleted.
|
||||
(append_uri_pathel): Query filechr_table directly.
|
||||
(filechr_table): Separated Unix, Windows, and control-unsafe
|
||||
characters.
|
||||
|
||||
2003-09-17 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* url.c (url_escape_1): New function.
|
||||
|
44
src/init.c
44
src/init.c
@ -189,7 +189,7 @@ static struct {
|
||||
{ "reject", &opt.rejects, cmd_vector },
|
||||
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
||||
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
||||
{ "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names },
|
||||
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
|
||||
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
|
||||
{ "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
|
||||
{ "robots", &opt.use_robots, cmd_boolean },
|
||||
@ -286,10 +286,11 @@ defaults (void)
|
||||
|
||||
/* File name restriction defaults to the rules of the host OS type. */
|
||||
#if !defined(WINDOWS) && !defined(__CYGWIN__)
|
||||
opt.restrict_file_names = restrict_shell;
|
||||
opt.restrict_files_os = restrict_unix;
|
||||
#else
|
||||
opt.restrict_file_names = restrict_windows;
|
||||
opt.restrict_files_os = restrict_windows;
|
||||
#endif
|
||||
opt.restrict_files_ctrl = 1;
|
||||
}
|
||||
|
||||
/* Return the user's home directory (strdup-ed), or NULL if none is
|
||||
@ -1020,20 +1021,41 @@ cmd_spec_recursive (const char *com, const char *val, void *closure)
|
||||
static int
|
||||
cmd_spec_restrict_file_names (const char *com, const char *val, void *closure)
|
||||
{
|
||||
/* The currently accepted values are `none', `unix', and
|
||||
`windows'. */
|
||||
if (0 == strcasecmp (val, "none"))
|
||||
opt.restrict_file_names = restrict_none;
|
||||
else if (0 == strcasecmp (val, "unix"))
|
||||
opt.restrict_file_names = restrict_shell;
|
||||
else if (0 == strcasecmp (val, "windows"))
|
||||
opt.restrict_file_names = restrict_windows;
|
||||
int restrict_os = opt.restrict_files_os;
|
||||
int restrict_ctrl = opt.restrict_files_ctrl;
|
||||
|
||||
const char *end = strchr (val, ',');
|
||||
if (!end)
|
||||
end = val + strlen (val);
|
||||
|
||||
#define VAL_IS(string_literal) BOUNDED_EQUAL (val, end, string_literal)
|
||||
|
||||
if (VAL_IS ("unix"))
|
||||
restrict_os = restrict_unix;
|
||||
else if (VAL_IS ("windows"))
|
||||
restrict_os = restrict_windows;
|
||||
else if (VAL_IS ("nocontrol"))
|
||||
restrict_ctrl = 0;
|
||||
else
|
||||
{
|
||||
err:
|
||||
fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"),
|
||||
exec_name, com, val);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#undef VAL_IS
|
||||
|
||||
if (*end)
|
||||
{
|
||||
if (!strcmp (end + 1, "nocontrol"))
|
||||
restrict_ctrl = 0;
|
||||
else
|
||||
goto err;
|
||||
}
|
||||
|
||||
opt.restrict_files_os = restrict_os;
|
||||
opt.restrict_files_ctrl = restrict_ctrl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -179,7 +179,7 @@ Download:\n\
|
||||
--bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\
|
||||
--limit-rate=RATE limit download rate to RATE.\n\
|
||||
--dns-cache=off disable caching DNS lookups.\n\
|
||||
--restrict-file-names=MODE restrict chars in file names to MODE.\n\
|
||||
--restrict-file-names=OS restrict chars in file names to ones OS allows.\n\
|
||||
\n"), stdout);
|
||||
fputs (_("\
|
||||
Directories:\n\
|
||||
|
@ -186,10 +186,12 @@ struct options
|
||||
char *post_file_name; /* File to post */
|
||||
|
||||
enum {
|
||||
restrict_none,
|
||||
restrict_shell,
|
||||
restrict_unix,
|
||||
restrict_windows
|
||||
} restrict_file_names; /* whether we restrict file name chars. */
|
||||
} restrict_files_os; /* file name restriction ruleset. */
|
||||
int restrict_files_ctrl; /* non-zero if control chars in URLs
|
||||
are restricted from appearing in
|
||||
generated file names. */
|
||||
};
|
||||
|
||||
extern struct options opt;
|
||||
|
68
src/url.c
68
src/url.c
@ -1479,23 +1479,22 @@ append_char (char ch, struct growable *dest)
|
||||
}
|
||||
|
||||
enum {
|
||||
filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */
|
||||
filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */
|
||||
filechr_unsafe_windows = 2, /* disallowed on Windows file system */
|
||||
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
|
||||
filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
|
||||
filechr_control = 4, /* a control character, e.g. 0-31 */
|
||||
};
|
||||
|
||||
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
|
||||
|
||||
/* Shorthands for the table: */
|
||||
#define A filechr_unsafe_always
|
||||
#define S filechr_unsafe_shell
|
||||
#define W filechr_unsafe_windows
|
||||
#define U filechr_not_unix
|
||||
#define W filechr_not_windows
|
||||
#define C filechr_control
|
||||
|
||||
/* Forbidden chars:
|
||||
#define UW U|W
|
||||
#define UWC U|W|C
|
||||
|
||||
always: \0, /
|
||||
Unix shell: 0-31, 128-159
|
||||
Windows: \, |, /, <, >, ?, :
|
||||
/* Table of characters unsafe under various conditions (see above).
|
||||
|
||||
Arguably we could also claim `%' to be unsafe, since we use it as
|
||||
the escape character. If we ever want to be able to reliably
|
||||
@ -1504,12 +1503,12 @@ enum {
|
||||
|
||||
const static unsigned char filechr_table[256] =
|
||||
{
|
||||
A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */
|
||||
S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */
|
||||
S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
|
||||
S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */
|
||||
UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
|
||||
C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
|
||||
C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
|
||||
C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
|
||||
0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
|
||||
0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */
|
||||
0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
|
||||
0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
|
||||
@ -1521,8 +1520,8 @@ const static unsigned char filechr_table[256] =
|
||||
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
|
||||
|
||||
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
|
||||
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
|
||||
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
|
||||
C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@ -1532,30 +1531,16 @@ const static unsigned char filechr_table[256] =
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
/* Return non-zero if character CH is unsafe for use in file or
|
||||
directory name. Called by append_uri_pathel. */
|
||||
|
||||
static inline int
|
||||
file_unsafe_char (char ch, int restrict)
|
||||
{
|
||||
int mask = filechr_unsafe_always;
|
||||
if (restrict == restrict_shell)
|
||||
mask |= filechr_unsafe_shell;
|
||||
else if (restrict == restrict_windows)
|
||||
mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
|
||||
return FILE_CHAR_TEST (ch, mask);
|
||||
}
|
||||
|
||||
/* FN_PORT_SEP is the separator between host and port in file names
|
||||
for non-standard port numbers. On Unix this is normally ':', as in
|
||||
"www.xemacs.org:4001/index.html". Under Windows, we set it to +
|
||||
because Windows can't handle ':' in file names. */
|
||||
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
|
||||
#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
|
||||
|
||||
/* FN_QUERY_SEP is the separator between the file name and the URL
|
||||
query, normally '?'. Since Windows cannot handle '?' as part of
|
||||
file name, we use '@' instead there. */
|
||||
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
|
||||
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
|
||||
|
||||
/* Quote path element, characters in [b, e), as file name, and append
|
||||
the quoted string to DEST. Each character is quoted as per
|
||||
@ -1570,12 +1555,13 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
|
||||
const char *p;
|
||||
int quoted, outlen;
|
||||
|
||||
/* Currently restrict_for_windows is determined at compile time
|
||||
only. But some users download files to Windows partitions; they
|
||||
should be able to say --windows-file-names so Wget escapes
|
||||
characters invalid on Windows. Similar run-time restrictions for
|
||||
other file systems can be implemented. */
|
||||
const int restrict = opt.restrict_file_names;
|
||||
int mask;
|
||||
if (opt.restrict_files_os == restrict_unix)
|
||||
mask = filechr_not_unix;
|
||||
else
|
||||
mask = filechr_not_windows;
|
||||
if (opt.restrict_files_ctrl)
|
||||
mask |= filechr_control;
|
||||
|
||||
/* Copy [b, e) to PATHEL and URL-unescape it. */
|
||||
BOUNDED_TO_ALLOCA (b, e, pathel);
|
||||
@ -1586,7 +1572,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
|
||||
add for file quoting. */
|
||||
quoted = 0;
|
||||
for (p = pathel; *p; p++)
|
||||
if (file_unsafe_char (*p, restrict))
|
||||
if (FILE_CHAR_TEST (*p, mask))
|
||||
++quoted;
|
||||
|
||||
/* p - pathel is the string length. Each quoted char means two
|
||||
@ -1605,7 +1591,7 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest)
|
||||
char *q = TAIL (dest);
|
||||
for (p = pathel; *p; p++)
|
||||
{
|
||||
if (!file_unsafe_char (*p, restrict))
|
||||
if (!FILE_CHAR_TEST (*p, mask))
|
||||
*q++ = *p;
|
||||
else
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user