1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] New mechanism for quoting file names.

Published in <m3smmzt4px.fsf@hniksic.iskon.hr>.
This commit is contained in:
hniksic 2003-09-14 15:04:13 -07:00
parent ebea9e7e0b
commit 0a3697ad65
13 changed files with 485 additions and 245 deletions

11
NEWS
View File

@ -7,8 +7,6 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
* Changes in Wget 1.9.
** The build process now requires Autoconf 2.5x.
** It is now possible to specify that POST method be used for HTTP
requests. For example, `wget --post-data="id=foo&data=bar" URL' will
send a POST request with the specified contents.
@ -32,6 +30,15 @@ considered a fatal error.
** The new option `--dns-cache=off' may be used to prevent Wget from
caching DNS lookups.
** The build process now requires Autoconf 2.5x.
** Wget no longer quotes characters in local file names that would be
considered "unsafe" as part of URL. Quoting can still occur for
control characters or for '/', but no longer for frequent characters
such as space. You can use the new option --restrict-file-names to
enforce even stricter rules, which is useful when downloading to
Windows partitions.
* Wget 1.8.1 is a bugfix release with no user-visible changes.

View File

@ -1,3 +1,8 @@
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi (Download Options): Document the new option
--restrict-file-names and the corresponding wgetrc command.
2003-09-10 Hrvoje Niksic <hniksic@xemacs.org>
* wget.texi (Download Options): Documented new option --dns-cache.

View File

@ -800,6 +800,39 @@ lookups where they're probably not needed.
If you don't understand the above description, you probably won't need
this option.
@cindex file names, restrict
@cindex Windows file names
@itemx --restrict-file-names=none|unix|windows
Restrict characters that may occur in local file names created by Wget
from remote URLs. Characters that are considered @dfn{unsafe} under a
set of restrictions are escaped, i.e. replaced with @samp{%XX}, where
@samp{XX} is the hexadecimal code of the character.
The default for this option depends on the operating system: on Unix and
Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it
defaults to ``windows''. Changing the default is useful when you are
using a non-native partition, e.g. when downloading files to a Windows
partition mounted from Linux, or when using NFS-mounted or SMB-mounted
Windows drives.
When set to ``none'', the only characters that are quoted are those that
are impossible to get into a file name---the NUL character and @samp{/}.
The control characters, newline, etc. are all placed into file names.
When set to ``unix'', additional unsafe characters are those in the
0--31 range and in the 128--159 range. This is because those characters
are typically not printable.
When set to ``windows'', all of the above are quoted, along with
@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<},
and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead
of @samp{:} to separate host and port in local file names, and uses
@samp{@@} instead of @samp{?} to separate the query portion of the file
name from the rest. Therefore, a URL that would be saved as
@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be
saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows
mode.
@end table
@node Directory Options, HTTP Options, Download Options, Invoking
@ -2241,6 +2274,10 @@ Links}).
If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it
to off is the same as @samp{-nr}.
@item restrict_file_names = off/unix/windows
Restrict the file names generated by Wget from URLs. See
@samp{--restrict-file-names} for a more detailed description.
@item retr_symlinks = on/off
When set to on, retrieve symbolic links as if they were plain files; the
same as @samp{--retr-symlinks}.

View File

@ -1,3 +1,31 @@
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* url.c (append_uri_pathel): Use opt.restrict_file_names when
calling file_unsafe_char.
* init.c: New command restrict_file_names.
* main.c (main): New option --restrict-file-names[=windows,unix].
* url.c (url_file_name): Renamed from url_filename.
(url_file_name): Add directory and hostdir prefix here, not in
mkstruct.
(append_dir_structure): New function, does part of the work that
used to be in mkstruct. Iterates over path elements in u->path,
calling append_uri_pathel on each one to append it to the file
name.
(append_uri_pathel): URL-unescape a path element and reencode it
with a different set of rules, more appropriate for handling of
files.
(file_unsafe_char): New function, uses a lookup table to decide
whether a character should be escaped for use in file name.
(append_string): New utility function.
(append_char): Ditto.
(file_unsafe_char): New argument restrict_for_windows, decide
whether Windows file names should be escaped in run-time.
* connect.c: Include <stdlib.h> to get prototype for abort().
2003-09-14 Hrvoje Niksic <hniksic@xemacs.org>
* utils.c (wtimer_sys_set): Extracted the code that sets the

View File

@ -30,6 +30,7 @@ so, delete this exception statement from your version. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>

View File

@ -842,8 +842,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f)
{
char *tmpu, *tmpp; /* temporary, clean user and passwd */
tmpu = encode_string (u->user);
tmpp = u->passwd ? encode_string (u->passwd) : NULL;
tmpu = url_escape (u->user);
tmpp = u->passwd ? url_escape (u->passwd) : NULL;
upwd = (char *)xmalloc (strlen (tmpu)
+ (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
@ -863,7 +863,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f)
fprintf (fp, " ");
if (f->tstamp != -1)
{
/* #### Should we translate the months? */
/* #### Should we translate the months? Or, even better, use
ISO 8601 dates? */
static char *months[] = {
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"

View File

@ -1025,7 +1025,7 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con)
struct stat st;
if (!con->target)
con->target = url_filename (u);
con->target = url_file_name (u);
if (opt.noclobber && file_exists_p (con->target))
{
@ -1245,7 +1245,7 @@ ftp_get_listing (struct url *u, ccon *con, struct fileinfo **f)
/* Find the listing file name. We do it by taking the file name of
the URL and replacing the last component with the listing file
name. */
uf = url_filename (u);
uf = url_file_name (u);
lf = file_merge (uf, LIST_FILENAME);
xfree (uf);
DEBUGP ((_("Using `%s' as listing tmp file.\n"), lf));
@ -1335,7 +1335,7 @@ ftp_retrieve_list (struct url *u, struct fileinfo *f, ccon *con)
ofile = xstrdup (u->file);
url_set_file (u, f->name);
con->target = url_filename (u);
con->target = url_file_name (u);
err = RETROK;
dlthis = 1;
@ -1723,7 +1723,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
char *filename = (opt.output_document
? xstrdup (opt.output_document)
: (con.target ? xstrdup (con.target)
: url_filename (u)));
: url_file_name (u)));
res = ftp_index (filename, u, f);
if (res == FTPOK && opt.verbose)
{

View File

@ -1614,12 +1614,12 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
hstat.local_file = local_file;
else if (local_file)
{
*local_file = url_filename (u);
*local_file = url_file_name (u);
hstat.local_file = local_file;
}
else
{
dummy = url_filename (u);
dummy = url_file_name (u);
hstat.local_file = &dummy;
}

View File

@ -100,6 +100,7 @@ CMD_DECLARE (cmd_spec_htmlify);
CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_recursive);
CMD_DECLARE (cmd_spec_restrict_file_names);
CMD_DECLARE (cmd_spec_useragent);
/* List of recognized commands, each consisting of name, closure and function.
@ -188,6 +189,7 @@ static struct {
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
{ "retryconnrefused", &opt.retry_connrefused, cmd_boolean },
{ "robots", &opt.use_robots, cmd_boolean },
@ -281,6 +283,13 @@ defaults (void)
opt.dots_in_line = 50;
opt.dns_cache = 1;
/* The default for file name restriction defaults to the OS type. */
#if !defined(WINDOWS) && !defined(__CYGWIN__)
opt.restrict_file_names = restrict_shell;
#else
opt.restrict_file_names = restrict_windows;
#endif
}
/* Return the user's home directory (strdup-ed), or NULL if none is
@ -1008,6 +1017,26 @@ cmd_spec_recursive (const char *com, const char *val, void *closure)
return 1;
}
/* Parse the value of the `restrict_file_names' command (option
   --restrict-file-names).  Recognized keywords are `none', `unix'
   and `windows', matched case-insensitively; anything else prints an
   error and fails.  Returns non-zero on success.  */
static int
cmd_spec_restrict_file_names (const char *com, const char *val, void *closure)
{
  int mode;

  if (!strcasecmp (val, "none"))
    mode = restrict_none;
  else if (!strcasecmp (val, "unix"))
    mode = restrict_shell;
  else if (!strcasecmp (val, "windows"))
    mode = restrict_windows;
  else
    {
      fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"),
	       exec_name, com, val);
      return 0;
    }

  opt.restrict_file_names = mode;
  return 1;
}
static int
cmd_spec_useragent (const char *com, const char *val, void *closure)
{

View File

@ -179,10 +179,11 @@ Download:\n\
--bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\
--limit-rate=RATE limit download rate to RATE.\n\
--dns-cache=off disable caching DNS lookups.\n\
--restrict-file-names=MODE restrict chars in file names to MODE.\n\
\n"), stdout);
fputs (_("\
Directories:\n\
-nd --no-directories don\'t create directories.\n\
-nd, --no-directories don\'t create directories.\n\
-x, --force-directories force creation of directories.\n\
-nH, --no-host-directories don\'t create host directories.\n\
-P, --directory-prefix=PREFIX save files to PREFIX/...\n\
@ -344,6 +345,7 @@ main (int argc, char *const *argv)
{ "proxy-user", required_argument, NULL, 143 },
{ "quota", required_argument, NULL, 'Q' },
{ "reject", required_argument, NULL, 'R' },
{ "restrict-file-names", required_argument, NULL, 176 },
{ "save-cookies", required_argument, NULL, 162 },
{ "timeout", required_argument, NULL, 'T' },
{ "tries", required_argument, NULL, 't' },
@ -610,6 +612,9 @@ GNU General Public License for more details.\n"));
case 175:
setval ("dnscache", optarg);
break;
case 176:
setval ("restrictfilenames", optarg);
break;
case 'A':
setval ("accept", optarg);
break;

View File

@ -184,6 +184,12 @@ struct options
char *post_data; /* POST query string */
char *post_file_name; /* File to post */
enum {
restrict_none,
restrict_shell,
restrict_windows
} restrict_file_names; /* whether we restrict file name chars. */
};
extern struct options opt;

557
src/url.c
View File

@ -1,5 +1,6 @@
/* URL handling.
Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
Free Software Foundation, Inc.
This file is part of GNU Wget.
@ -95,24 +96,22 @@ static int path_simplify PARAMS ((char *));
code assumes ASCII character set and 8-bit chars. */
enum {
/* rfc1738 reserved chars, preserved from encoding. */
urlchr_reserved = 1,
/* rfc1738 unsafe chars, plus some more. */
urlchr_unsafe = 2
};
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Shorthands for the table: */
#define R urlchr_reserved
#define U urlchr_unsafe
#define RU R|U
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
const static unsigned char urlchr_table[256] =
{
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
@ -142,6 +141,9 @@ const static unsigned char urlchr_table[256] =
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
};
#undef R
#undef U
#undef RU
/* Decodes the forms %xy in a URL to the character the hexadecimal
code of which is xy. xy are hexadecimal digits from
@ -150,7 +152,7 @@ const static unsigned char urlchr_table[256] =
literally. */
static void
decode_string (char *s)
url_unescape (char *s)
{
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
@ -175,10 +177,10 @@ decode_string (char *s)
*t = '\0';
}
/* Like encode_string, but return S if there are no unsafe chars. */
/* Like url_escape, but return S if there are no unsafe chars. */
static char *
encode_string_maybe (const char *s)
url_escape_allow_passthrough (const char *s)
{
const char *p1;
char *p2, *newstr;
@ -186,7 +188,7 @@ encode_string_maybe (const char *s)
int addition = 0;
for (p1 = s; *p1; p1++)
if (UNSAFE_CHAR (*p1))
if (URL_UNSAFE_CHAR (*p1))
addition += 2; /* Two more characters (hex digits) */
if (!addition)
@ -199,7 +201,7 @@ encode_string_maybe (const char *s)
p2 = newstr;
while (*p1)
{
if (UNSAFE_CHAR (*p1))
if (URL_UNSAFE_CHAR (*p1))
{
unsigned char c = *p1++;
*p2++ = '%';
@ -215,13 +217,13 @@ encode_string_maybe (const char *s)
return newstr;
}
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
given string, returning a malloc-ed %XX encoded string. */
char *
encode_string (const char *s)
url_escape (const char *s)
{
char *encoded = encode_string_maybe (s);
char *encoded = url_escape_allow_passthrough (s);
if (encoded != s)
return encoded;
else
@ -233,7 +235,7 @@ encode_string (const char *s)
allocated storage. */
#define ENCODE(ptr) do { \
char *e_new = encode_string_maybe (ptr); \
char *e_new = url_escape_allow_passthrough (ptr); \
if (e_new != ptr) \
{ \
xfree (ptr); \
@ -258,7 +260,7 @@ decide_copy_method (const char *p)
char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
XCHAR_TO_XDIGIT (*(p + 2));
if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
return CM_PASSTHROUGH;
else
return CM_DECODE;
@ -267,20 +269,20 @@ decide_copy_method (const char *p)
/* Garbled %.. sequence: encode `%'. */
return CM_ENCODE;
}
else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
return CM_ENCODE;
else
return CM_PASSTHROUGH;
}
/* Translate a %-quoting (but possibly non-conformant) input string S
into a %-quoting (and conformant) output string. If no characters
/* Translate a %-escaped (but possibly non-conformant) input string S
into a %-escaped (and conformant) output string. If no characters
are encoded or decoded, return the same string S; otherwise, return
a freshly allocated string with the new contents.
After a URL has been run through this function, the protocols that
use `%' as the quote character can use the resulting string as-is,
while those that don't call decode_string() to get to the intended
while those that don't call url_unescape() to get to the intended
data. This function is also stable: after an input string is
transformed the first time, all further transformations of the
result yield the same result string.
@ -293,20 +295,21 @@ decide_copy_method (const char *p)
GET /abc%20def HTTP/1.0
So it appears that the unsafe chars need to be quoted, as with
encode_string. But what if we're requested to download
`abc%20def'? Remember that %-encoding is valid URL syntax, so what
the user meant was a literal space, and he was kind enough to quote
it. In that case, Wget should obviously leave the `%20' as is, and
send the same request as above. So in this case we may not call
encode_string.
It appears that the unsafe chars need to be quoted, for example
with url_escape. But what if we're requested to download
`abc%20def'? url_escape transforms "%" to "%25", which would leave
us with `abc%2520def'. This is incorrect -- since %-escapes are
part of URL syntax, "%20" is the correct way to denote a literal
space on the Wget command line. This leaves us in the conclusion
that in that case Wget should not call url_escape, but leave the
`%20' as is.
But what if the requested URI is `abc%20 def'? If we call
encode_string, we end up with `/abc%2520%20def', which is almost
certainly not intended. If we don't call encode_string, we are
left with the embedded space and cannot send the request. What the
And what if the requested URI is `abc%20 def'? If we call
url_escape, we end up with `/abc%2520%20def', which is almost
certainly not intended. If we don't call url_escape, we are left
with the embedded space and cannot complete the request. What the
user meant was for Wget to request `/abc%20%20def', and this is
where reencode_string kicks in.
where reencode_escapes kicks in.
Wget used to solve this by first decoding %-quotes, and then
encoding all the "unsafe" characters found in the resulting string.
@ -317,7 +320,7 @@ decide_copy_method (const char *p)
is inevitable because by the second step we would lose information
on whether the `+' was originally encoded or not. Both results
were wrong because in CGI parameters + means space, while %2B means
literal plus. reencode_string correctly translates the above to
literal plus. reencode_escapes correctly translates the above to
"a%2B+b", i.e. returns the original string.
This function uses an algorithm proposed by Anon Sricharoenchai:
@ -352,7 +355,7 @@ decide_copy_method (const char *p)
"foo%2b+bar" -> "foo%2b+bar" */
static char *
reencode_string (const char *s)
reencode_escapes (const char *s)
{
const char *p1;
char *newstr, *p2;
@ -417,12 +420,12 @@ reencode_string (const char *s)
return newstr;
}
/* Run PTR_VAR through reencode_string. If a new string is consed,
/* Run PTR_VAR through reencode_escapes. If a new string is consed,
free PTR_VAR and make it point to the new storage. Obviously,
PTR_VAR needs to be an lvalue. */
#define REENCODE(ptr_var) do { \
char *rf_new = reencode_string (ptr_var); \
char *rf_new = reencode_escapes (ptr_var); \
if (rf_new != ptr_var) \
{ \
xfree (ptr_var); \
@ -544,9 +547,9 @@ parse_uname (const char *str, int len, char **user, char **passwd)
(*user)[len] = '\0';
if (*user)
decode_string (*user);
url_unescape (*user);
if (*passwd)
decode_string (*passwd);
url_unescape (*passwd);
return 1;
}
@ -611,6 +614,10 @@ rewrite_shorthand_url (const char *url)
static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, with the exception that it returns the pointer to the
terminating zero (end-of-string aka "eos") if no matching character
is found. */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
@ -825,7 +832,7 @@ url_parse (const char *url, int *error)
return NULL;
}
url_encoded = reencode_string (url);
url_encoded = reencode_escapes (url);
p = url_encoded;
p += strlen (supported_schemes[scheme].leading_string);
@ -1032,13 +1039,13 @@ url_error (int error_code)
return parse_errors[error_code];
}
static void
parse_path (const char *quoted_path, char **dir, char **file)
{
char *path, *last_slash;
/* Parse PATH into dir and file. PATH is extracted from the URL and
is URL-escaped. The function returns unescaped DIR and FILE. */
STRDUP_ALLOCA (path, quoted_path);
decode_string (path);
static void
parse_path (const char *path, char **dir, char **file)
{
char *last_slash;
last_slash = strrchr (path, '/');
if (!last_slash)
@ -1051,6 +1058,8 @@ parse_path (const char *quoted_path, char **dir, char **file)
*dir = strdupdelim (path, last_slash);
*file = xstrdup (last_slash + 1);
}
url_unescape (*dir);
url_unescape (*file);
}
/* Note: URL's "full path" is the path with the query string and
@ -1303,8 +1312,6 @@ rotate_backups(const char *fname)
{
sprintf (from, "%s.%d", fname, i - 1);
sprintf (to, "%s.%d", fname, i);
/* #### This will fail on machines without the rename() system
call. */
rename (from, to);
}
@ -1323,11 +1330,14 @@ mkalldirs (const char *path)
int res;
p = path + strlen (path);
for (; *p != '/' && p != path; p--);
for (; *p != '/' && p != path; p--)
;
/* Don't create if it's just a file. */
if ((p == path) && (*p != '/'))
return 0;
t = strdupdelim (path, p);
/* Check whether the directory exists. */
if ((stat (t, &st) == 0))
{
@ -1360,194 +1370,302 @@ mkalldirs (const char *path)
xfree (t);
return res;
}
/* Functions for constructing the file name out of URL components. */
static int
count_slashes (const char *s)
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have an easy way to construct a string by having
   various functions append data to it.  Instead of passing the
   obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
   question, we pass the pointer to this struct.  */

struct growable {
  char *base;			/* the buffer that holds the string */
  int size;			/* number of bytes allocated in BASE */
  int tail;			/* index one past the last used byte */
};

/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing.

   Note: the arguments are parenthesized in the expansion so that
   expression arguments expand safely.  */

#define GROW(g, append_size) do {					\
  struct growable *G_ = (g);						\
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);	\
} while (0)

/* Return the tail position of the string. */

#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */

#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
/* Append the string STR to DEST. NOTICE: the string in DEST is not
terminated. */
static void
append_string (const char *str, struct growable *dest)
{
int i = 0;
while (*s)
if (*s++ == '/')
++i;
return i;
int l = strlen (str);
GROW (dest, l);
memcpy (TAIL (dest), str, l);
TAIL_INCR (dest, l);
}
/* Return the path name of the URL-equivalent file name, with a
remote-like structure of directories. */
static char *
mkstruct (const struct url *u)
{
char *dir, *file;
char *res, *dirpref;
int l;
/* Append CH to DEST. For example, append_char (0, DEST)
zero-terminates DEST. */
if (opt.cut_dirs)
static void
append_char (char ch, struct growable *dest)
{
char *ptr = u->dir + (*u->dir == '/');
int slash_count = 1 + count_slashes (ptr);
int cut = MINVAL (opt.cut_dirs, slash_count);
for (; cut && *ptr; ptr++)
if (*ptr == '/')
--cut;
STRDUP_ALLOCA (dir, ptr);
GROW (dest, 1);
*TAIL (dest) = ch;
TAIL_INCR (dest, 1);
}
/* Bits describing why a character is unsafe in a file name.  These
   must be distinct bit values so that file_unsafe_char can build a
   mask appropriate for the active restriction mode.  */
enum {
  filechr_unsafe_always  = 1,	/* always unsafe, e.g. / or \0 */
  filechr_unsafe_shell   = 2,	/* unsafe for shell use, e.g. control chars */
  filechr_unsafe_windows = 4	/* disallowed on Windows file system */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define A filechr_unsafe_always
#define S filechr_unsafe_shell
#define W filechr_unsafe_windows

/* Forbidden chars:

   always:     \0, /
   Unix shell: 0-31, 128-159
   Windows:    \, |, /, <, >, ?, :, ", *

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

const static unsigned char filechr_table[256] =
{
  A,  S,  S,  S,   S,  S,  S,  S,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  S,  S,  S,  S,   S,  S,  S,  S,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  S,  S,  S,  S,   S,  S,  S,  S,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  S,  S,  S,  S,   S,  S,  S,  S,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0,  A,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 128-143 */
  S, S, S, S,  S, S, S, S,  S, S, S, S,  S, S, S, S, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef A
#undef S
#undef W
/* Return non-zero if character CH is unsafe for use in file or
directory name. Called by append_uri_pathel. */
static inline int
file_unsafe_char (char ch, int restrict)
{
int mask = filechr_unsafe_always;
if (restrict == restrict_shell)
mask |= filechr_unsafe_shell;
else if (restrict == restrict_windows)
mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
return FILE_CHAR_TEST (ch, mask);
}
/* FN_PORT_SEP is the separator between host and port in file names
for non-standard port numbers. On Unix this is normally ':', as in
"www.xemacs.org:4001/index.html". Under Windows, we set it to +
because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
/* FN_QUERY_SEP is the separator between the file name and the URL
query, normally '?'. Since Windows cannot handle '?' as part of
file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
file_unsafe_char and the corresponding table. */
static void
append_uri_pathel (const char *b, const char *e, struct growable *dest)
{
char *pathel;
int pathlen;
const char *p;
int quoted, outlen;
/* Currently restrict_for_windows is determined at compile time
only. But some users download files to Windows partitions; they
should be able to say --windows-file-names so Wget escapes
characters invalid on Windows. Similar run-time restrictions for
other file systems can be implemented. */
const int restrict = opt.restrict_file_names;
/* Copy [b, e) to PATHEL and URL-unescape it. */
BOUNDED_TO_ALLOCA (b, e, pathel);
url_unescape (pathel);
pathlen = strlen (pathel);
/* Go through PATHEL and check how many characters we'll need to
add for file quoting. */
quoted = 0;
for (p = pathel; *p; p++)
if (file_unsafe_char (*p, restrict))
++quoted;
/* p - pathel is the string length. Each quoted char means two
additional characters in the string, hence 2*quoted. */
outlen = (p - pathel) + (2 * quoted);
GROW (dest, outlen);
if (!quoted)
{
/* If there's nothing to quote, we don't need to go through the
string the second time. */
memcpy (TAIL (dest), pathel, outlen);
}
else
dir = u->dir + (*u->dir == '/');
/* Check for the true name (or at least a consistent name for saving
to directory) of HOST, reusing the hlist if possible. */
if (opt.add_hostdir)
{
/* Add dir_prefix and hostname (if required) to the beginning of
dir. */
dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
+ strlen (u->host)
+ 1 + numdigit (u->port)
+ 1);
if (!DOTP (opt.dir_prefix))
sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
char *q = TAIL (dest);
for (p = pathel; *p; p++)
{
if (!file_unsafe_char (*p, restrict))
*q++ = *p;
else
strcpy (dirpref, u->host);
if (u->port != scheme_default_port (u->scheme))
{
int len = strlen (dirpref);
dirpref[len] = ':';
number_to_string (dirpref + len + 1, u->port);
unsigned char ch = *p;
*q++ = '%';
*q++ = XDIGIT_TO_XCHAR (ch >> 4);
*q++ = XDIGIT_TO_XCHAR (ch & 0xf);
}
}
else /* not add_hostdir */
assert (q - TAIL (dest) == outlen);
}
TAIL_INCR (dest, outlen);
}
/* Append to DEST the directory structure that corresponds the
directory part of URL's path. For example, if the URL is
http://server/dir1/dir2/file, this appends "/dir1/dir2".
Each path element ("dir1" and "dir2" in the above example) is
examined, url-unescaped, and re-escaped as file name element.
Additionally, it cuts as many directories from the path as
specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
will produce "/dir2" for the above example; for 2 or more, it will
produce "".
Each component of the path is quoted for use as file name. */
static void
append_dir_structure (const struct url *u, struct growable *dest)
{
if (!DOTP (opt.dir_prefix))
dirpref = opt.dir_prefix;
else
dirpref = "";
}
char *pathel, *next;
int cut = opt.cut_dirs;
/* If there is a prefix, prepend it. */
if (*dirpref)
/* Go through the path components, de-URL-quote them, and quote them
(if necessary) as file names. */
pathel = u->path;
for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
{
char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
dir = newdir;
}
if (cut-- > 0)
continue;
if (pathel == next)
/* Ignore empty pathels. path_simplify should remove
occurrences of "//" from the path, but it has special cases
for starting / which generates an empty pathel here. */
continue;
l = strlen (dir);
if (l && dir[l - 1] == '/')
dir[l - 1] = '\0';
if (!*u->file)
file = "index.html";
else
file = u->file;
/* Finally, construct the full name. */
res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
+ 1);
sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
return res;
}
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
an escaped query string. The trick is to make sure that unsafe
characters in BASE are escaped, and that slashes in QUERY are also
escaped. */
static char *
compose_file_name (char *base, char *query)
{
char result[256];
char *from;
char *to = result;
/* Copy BASE to RESULT and encode all unsafe characters. */
from = base;
while (*from && to - result < sizeof (result))
{
if (UNSAFE_CHAR (*from))
{
unsigned char c = *from++;
*to++ = '%';
*to++ = XDIGIT_TO_XCHAR (c >> 4);
*to++ = XDIGIT_TO_XCHAR (c & 0xf);
}
else
*to++ = *from++;
}
if (query && to - result < sizeof (result))
{
*to++ = '?';
/* Copy QUERY to RESULT and encode all '/' characters. */
from = query;
while (*from && to - result < sizeof (result))
{
if (*from == '/')
{
*to++ = '%';
*to++ = '2';
*to++ = 'F';
++from;
}
else
*to++ = *from++;
if (dest->tail)
append_char ('/', dest);
append_uri_pathel (pathel, next, dest);
}
}
if (to - result < sizeof (result))
*to = '\0';
else
/* Truncate input which is too long, presumably due to a huge
query string. */
result[sizeof (result) - 1] = '\0';
/* Return a unique file name that matches the given URL as good as
possible. Does not create directories on the file system. */
return xstrdup (result);
}
/* Create a unique filename, corresponding to a given URL. Calls
mkstruct if necessary. Does *not* actually create any directories. */
char *
url_filename (const struct url *u)
url_file_name (const struct url *u)
{
char *file, *name;
struct growable fnres;
char *query = u->query && *u->query ? u->query : NULL;
char *u_file, *u_query;
char *fname, *unique;
fnres.base = NULL;
fnres.size = 0;
fnres.tail = 0;
/* Start with the directory prefix, if specified. */
if (!DOTP (opt.dir_prefix))
append_string (opt.dir_prefix, &fnres);
/* If "dirstruct" is turned on (typically the case with -r), add
the host and port (unless those have been turned off) and
directory structure. */
if (opt.dirstruct)
{
char *base = mkstruct (u);
file = compose_file_name (base, query);
xfree (base);
}
else
if (opt.add_hostdir)
{
char *base = *u->file ? u->file : "index.html";
file = compose_file_name (base, query);
/* Check whether the prefix directory is something other than "."
before prepending it. */
if (!DOTP (opt.dir_prefix))
if (fnres.tail)
append_char ('/', &fnres);
append_string (u->host, &fnres);
if (u->port != scheme_default_port (u->scheme))
{
/* #### should just realloc FILE and prepend dir_prefix. */
char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
+ 1 + strlen (file) + 1);
sprintf (nfile, "%s/%s", opt.dir_prefix, file);
xfree (file);
file = nfile;
char portstr[24];
number_to_string (portstr, u->port);
append_char (FN_PORT_SEP, &fnres);
append_string (portstr, &fnres);
}
}
/* DOS-ish file systems don't like `%' signs in them; we change it
to `@'. */
#ifdef WINDOWS
{
char *p = file;
for (p = file; *p; p++)
if (*p == '%')
*p = '@';
append_dir_structure (u, &fnres);
}
#endif /* WINDOWS */
/* Add the file name. */
if (fnres.tail)
append_char ('/', &fnres);
u_file = *u->file ? u->file : "index.html";
append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
/* Append "?query" to the file name. */
u_query = u->query && *u->query ? u->query : NULL;
if (u_query)
{
append_char (FN_QUERY_SEP, &fnres);
append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
}
/* Zero-terminate the file name. */
append_char ('\0', &fnres);
fname = fnres.base;
/* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc).
@ -1557,17 +1675,18 @@ url_filename (const struct url *u)
The exception is the case when file does exist and is a
directory (actually support for bad httpd-s). */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (file) && !file_non_directory_p (file)))
return file;
&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
return fnres.base;
/* Find a unique name. */
name = unique_name (file);
xfree (file);
return name;
unique = unique_name (fname);
xfree (fname);
return unique;
}
/* Return the langth of URL's path. Path is considered to be
/* Return the length of URL's path. Path is considered to be
terminated by one of '?', ';', '#', or by the end of the
string. */
static int
@ -1680,8 +1799,10 @@ path_simplify (char *path)
else if (*p == '/')
{
/* Remove empty path elements. Not mandated by rfc1808 et
al, but empty path elements are not all that useful, and
the rest of Wget might not deal with them well. */
al, but it seems like a good idea to get rid of them.
Supporting them properly is hard (in which directory do
you save http://x.com///y.html?) and they don't seem to
bring much gain. */
char *q = p;
while (*q == '/')
++q;
@ -1964,13 +2085,13 @@ url_string (const struct url *url, int hide_password)
/* Make sure the user name and password are quoted. */
if (url->user)
{
quoted_user = encode_string_maybe (url->user);
quoted_user = url_escape_allow_passthrough (url->user);
if (url->passwd)
{
if (hide_password)
quoted_passwd = HIDDEN_PASSWORD;
else
quoted_passwd = encode_string_maybe (url->passwd);
quoted_passwd = url_escape_allow_passthrough (url->passwd);
}
}

View File

@ -130,7 +130,7 @@ typedef enum
/* Function declarations */
char *encode_string PARAMS ((const char *));
char *url_escape PARAMS ((const char *));
struct url *url_parse PARAMS ((const char *, int *));
const char *url_error PARAMS ((int));
@ -157,7 +157,7 @@ char *uri_merge PARAMS ((const char *, const char *));
void rotate_backups PARAMS ((const char *));
int mkalldirs PARAMS ((const char *));
char *url_filename PARAMS ((const struct url *));
char *url_file_name PARAMS ((const struct url *));
char *getproxy PARAMS ((struct url *));
int no_proxy_match PARAMS ((const char *, const char **));