From 61bb00adc0a2da373646e5cf8cef1ac980a3337f Mon Sep 17 00:00:00 2001 From: hniksic Date: Fri, 13 Apr 2001 21:11:35 -0700 Subject: [PATCH] [svn] Various url.c-related changes. Published in . * retr.c (retrieve_url): Call uri_merge, not url_concat. * html-url.c (collect_tags_mapper): Call uri_merge, not url_concat. * url.c (mkstruct): Use encode_string instead of xstrdup followed by URL_CLEANSE. (path_simplify_with_kludge): Deleted. (contains_unsafe): Deleted. (construct): Renamed to uri_merge_1. (url_concat): Renamed to uri_merge. * url.c (str_url): Use encode_string instead of the unnecessary CLEANDUP. (encode_string_maybe): New function, returns input string if no encoding is needed. (encode_string): Call encode_string_maybe to do the dirty work, xstrdup if no work needed. * wget.h (XDIGIT_TO_xchar): Define here. * url.c (decode_string): Use new name. (encode_string): Ditto. * http.c (XDIGIT_TO_xchar): Rename HEXD2asc to XDIGIT_TO_xchar. (dump_hash): Use new name. * wget.h: Rename ASC2HEXD and HEXD2ASC to XCHAR_TO_XDIGIT and XDIGIT_TO_XCHAR respectively. --- src/ChangeLog | 36 +++++++ src/ftp-ls.c | 4 +- src/html-url.c | 4 +- src/http.c | 8 +- src/retr.c | 2 +- src/url.c | 275 +++++++++++++++++++++++++------------------------ src/url.h | 7 +- src/wget.h | 19 +++- 8 files changed, 200 insertions(+), 155 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 98b34b63..921e0f3e 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,39 @@ +2001-04-14 Hrvoje Niksic + + * retr.c (retrieve_url): Call uri_merge, not url_concat. + + * html-url.c (collect_tags_mapper): Call uri_merge, not + url_concat. + + * url.c (mkstruct): Use encode_string instead of xstrdup followed + by URL_CLEANSE. + (path_simplify_with_kludge): Deleted. + (contains_unsafe): Deleted. + (construct): Renamed to uri_merge_1. + (url_concat): Renamed to uri_merge. + +2001-04-13 Hrvoje Niksic + + * url.c (str_url): Use encode_string instead of the unnecessary + CLEANDUP. 
+ (encode_string_maybe): New function, returns input string if no + encoding is needed. + (encode_string): Call encode_string_maybe to do the dirty work, + xstrdup if no work needed. + +2001-04-13 Hrvoje Niksic + + * wget.h (XDIGIT_TO_xchar): Define here. + + * url.c (decode_string): Use new name. + (encode_string): Ditto. + + * http.c (XDIGIT_TO_xchar): Rename HEXD2asc to XDIGIT_TO_xchar. + (dump_hash): Use new name. + + * wget.h: Rename ASC2HEXD and HEXD2ASC to XCHAR_TO_XDIGIT and + XDIGIT_TO_XCHAR respectively. + 2001-04-13 Hrvoje Niksic * init.c: Include cookies.h. diff --git a/src/ftp-ls.c b/src/ftp-ls.c index b3622189..58d652ae 100644 --- a/src/ftp-ls.c +++ b/src/ftp-ls.c @@ -815,8 +815,8 @@ ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f) { char *tmpu, *tmpp; /* temporary, clean user and passwd */ - tmpu = CLEANDUP (u->user); - tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL; + tmpu = encode_string (u->user); + tmpp = u->passwd ? encode_string (u->passwd) : NULL; upwd = (char *)xmalloc (strlen (tmpu) + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : ""); diff --git a/src/html-url.c b/src/html-url.c index 67a7c229..16d64cb3 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -335,7 +335,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, complete_uri = xstrdup (link_uri); } else - complete_uri = url_concat (base, link_uri); + complete_uri = uri_merge (base, link_uri); DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", closure->document_file, base ? 
base : "(null)", @@ -420,7 +420,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg) if (closure->base) xfree (closure->base); if (closure->parent_base) - closure->base = url_concat (closure->parent_base, newbase); + closure->base = uri_merge (closure->parent_base, newbase); else closure->base = xstrdup (newbase); } diff --git a/src/http.c b/src/http.c index 83108bce..7b9578d1 100644 --- a/src/http.c +++ b/src/http.c @@ -2046,10 +2046,6 @@ extract_header_attr (const char *au, const char *attr_name, char **ret) return 0; } -/* Response value needs to be in lowercase, so we cannot use HEXD2ASC - from url.h. See RFC 2069 2.1.2 for the syntax of response-digest. */ -#define HEXD2asc(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'a')) - /* Dump the hexadecimal representation of HASH to BUF. HASH should be an array of 16 bytes containing the hash keys, and BUF should be a buffer of 33 writable characters (32 for hex digits plus one for @@ -2061,8 +2057,8 @@ dump_hash (unsigned char *buf, const unsigned char *hash) for (i = 0; i < MD5_HASHLEN; i++, hash++) { - *buf++ = HEXD2asc (*hash >> 4); - *buf++ = HEXD2asc (*hash & 0xf); + *buf++ = XDIGIT_TO_xchar (*hash >> 4); + *buf++ = XDIGIT_TO_xchar (*hash & 0xf); } *buf = '\0'; } diff --git a/src/retr.c b/src/retr.c index 95e663fa..3aae5125 100644 --- a/src/retr.c +++ b/src/retr.c @@ -493,7 +493,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, redirects, but a ton of boneheaded webservers and CGIs out there break the rules and use relative URLs, and popular browsers are lenient about this, so wget should be too. */ - construced_newloc = url_concat (url, mynewloc); + construced_newloc = uri_merge (url, mynewloc); xfree (mynewloc); mynewloc = construced_newloc; diff --git a/src/url.c b/src/url.c index 9b110e6e..55587efd 100644 --- a/src/url.c +++ b/src/url.c @@ -1,5 +1,5 @@ /* URL handling. - Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. 
+ Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. This file is part of Wget. @@ -49,39 +49,34 @@ static char unsafe_char_table[256]; #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)]) -/* If S contains unsafe characters, free it and replace it with a - version that doesn't. */ -#define URL_CLEANSE(s) do \ -{ \ - if (contains_unsafe (s)) \ - { \ - char *uc_tmp = encode_string (s); \ - xfree (s); \ - (s) = uc_tmp; \ - } \ -} while (0) +/* rfc1738 reserved chars. This is too short to warrant a table. We + don't use this yet; preservation of reserved chars will be + implemented when I integrate the new `reencode_string' + function. */ +#define RESERVED_CHAR(c) ( (c) == ';' || (c) == '/' || (c) == '?' \ + || (c) == '@' || (c) == '=' || (c) == '&' \ + || (c) == '+') -/* Is a directory "."? */ +/* Is X "."? */ #define DOTP(x) ((*(x) == '.') && (!*(x + 1))) -/* Is a directory ".."? */ +/* Is X ".."? */ #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2))) -#if 0 -static void path_simplify_with_kludge PARAMS ((char *)); -#endif static int urlpath_length PARAMS ((const char *)); -/* A NULL-terminated list of strings to be recognized as prototypes - (URL schemes). Note that recognized doesn't mean supported -- only - HTTP, HTTPS and FTP are currently supported . +/* A NULL-terminated list of strings to be recognized as protocol + types (URL schemes). Note that recognized doesn't mean supported + -- only HTTP, HTTPS and FTP are currently supported. However, a string that does not match anything in the list will be considered a relative URL. Thus it's important that this list has anything anyone could think of being legal. - There are wild things here. :-) Take a look at - for more - fun. */ + #### This is probably broken. Wget should use other means to + distinguish between absolute and relative URIs in HTML links. + + Take a look at + for more. 
*/ static char *protostrings[] = { "cid:", @@ -170,16 +165,6 @@ init_unsafe_char_table (void) unsafe_char_table[i] = 1; } -/* Returns 1 if the string contains unsafe characters, 0 otherwise. */ -int -contains_unsafe (const char *s) -{ - for (; *s; s++) - if (UNSAFE_CHAR (*s)) - return 1; - return 0; -} - /* Decodes the forms %xy in a URL to the character the hexadecimal code of which is xy. xy are hexadecimal digits from [0123456789ABCDEF] (case-insensitive). If x or y are not @@ -205,43 +190,80 @@ decode_string (char *s) *p = *s; continue; } - *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2)); + *p = (XCHAR_TO_XDIGIT (*(s + 1)) << 4) + XCHAR_TO_XDIGIT (*(s + 2)); s += 2; } } *p = '\0'; } -/* Encode the unsafe characters (as determined by URL_UNSAFE) in a +/* Like encode_string, but return S if there are no unsafe chars. */ + +static char * +encode_string_maybe (const char *s) +{ + const char *p1; + char *p2, *newstr; + int newlen; + int addition = 0; + + for (p1 = s; *p1; p1++) + if (UNSAFE_CHAR (*p1)) + addition += 2; /* Two more characters (hex digits) */ + + if (!addition) + return (char *)s; + + newlen = (p1 - s) + addition; + newstr = (char *)xmalloc (newlen + 1); + + p1 = s; + p2 = newstr; + while (*p1) + { + if (UNSAFE_CHAR (*p1)) + { + const unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XDIGIT_TO_XCHAR (c >> 4); + *p2++ = XDIGIT_TO_XCHAR (c & 0xf); + } + else + *p2++ = *p1++; + } + *p2 = '\0'; + assert (p2 - newstr == newlen); + + return newstr; +} + +/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a given string, returning a malloc-ed %XX encoded string. 
*/ + char * encode_string (const char *s) { - const char *b; - char *p, *res; - int i; - - b = s; - for (i = 0; *s; s++, i++) - if (UNSAFE_CHAR (*s)) - i += 2; /* Two more characters (hex digits) */ - res = (char *)xmalloc (i + 1); - s = b; - for (p = res; *s; s++) - if (UNSAFE_CHAR (*s)) - { - const unsigned char c = *s; - *p++ = '%'; - *p++ = HEXD2ASC (c >> 4); - *p++ = HEXD2ASC (c & 0xf); - } - else - *p++ = *s; - *p = '\0'; - return res; + char *encoded = encode_string_maybe (s); + if (encoded != s) + return encoded; + else + return xstrdup (s); } + +/* Encode unsafe characters in PTR to %xx. If such encoding is done, + the old value of PTR is freed and PTR is made to point to the newly + allocated storage. */ + +#define ENCODE(ptr) do { \ + char *e_new = encode_string_maybe (ptr); \ + if (e_new != ptr) \ + { \ + xfree (ptr); \ + ptr = e_new; \ + } \ +} while (0) -/* Returns the proto-type if URL's protocol is supported, or +/* Returns the protocol type if URL's protocol is supported, or URLUNKNOWN if not. */ uerr_t urlproto (const char *url) @@ -499,14 +521,17 @@ parseurl (const char *url, struct urlinfo *u, int strict) strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir); strcat (u->path, *u->dir ? "/" : ""); strcat (u->path, u->file); - URL_CLEANSE (u->path); + ENCODE (u->path); DEBUGP (("newpath: %s\n", u->path)); /* Create the clean URL. */ u->url = str_url (u, 0); return URLOK; } -/* Special versions of DOTP and DDOTP for parse_dir(). */ +/* Special versions of DOTP and DDOTP for parse_dir(). They work like + DOTP and DDOTP, but they also recognize `?' as end-of-string + delimiter. This is needed for correct handling of query + strings. 
*/ #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?')) #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.') \ @@ -652,12 +677,12 @@ str_url (const struct urlinfo *u, int hide) return NULL; proto_name = sup_protos[i].name; proto_default_port = sup_protos[i].port; - host = CLEANDUP (u->host); - dir = CLEANDUP (u->dir); - file = CLEANDUP (u->file); + host = encode_string (u->host); + dir = encode_string (u->dir); + file = encode_string (u->file); user = passwd = NULL; if (u->user) - user = CLEANDUP (u->user); + user = encode_string (u->user); if (u->passwd) { if (hide) @@ -667,7 +692,7 @@ str_url (const struct urlinfo *u, int hide) this code, when we replaced the password characters with 'x's. */ passwd = xstrdup(""); else - passwd = CLEANDUP (u->passwd); + passwd = encode_string (u->passwd); } if (u->proto == URLFTP && *dir == '/') { @@ -974,8 +999,7 @@ mkstruct (const struct urlinfo *u) sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir); dir = newdir; } - dir = xstrdup (dir); - URL_CLEANSE (dir); + dir = encode_string (dir); l = strlen (dir); if (l && dir[l - 1] == '/') dir[l - 1] = '\0'; @@ -1078,37 +1102,46 @@ find_last_char (const char *b, const char *e, char c) return NULL; } -/* Construct a URL by concatenating an absolute URL and a path, which - may or may not be absolute. This tries to behave "reasonably" in - all foreseeable cases. It employs little specific knowledge about - protocols or URL-specific stuff -- it just works on strings. */ +/* Resolve the result of "linking" a base URI (BASE) to a + link-specified URI (LINK). + + Either of the URIs may be absolute or relative, complete with the + host name, or path only. This tries to behave "reasonably" in all + foreseeable cases. It employs little specific knowledge about + protocols or URL-specific stuff -- it just works on strings. + + The parameter LINKLENGTH is useful if LINK is not zero-terminated. + See uri_merge for a gentler interface to this functionality. 
+ + #### This function should handle `./' and `../' so that the evil + path_simplify can go. */ static char * -construct (const char *url, const char *sub, int subsize, int no_proto) +uri_merge_1 (const char *base, const char *link, int linklength, int no_proto) { char *constr; if (no_proto) { - const char *end = url + urlpath_length (url); + const char *end = base + urlpath_length (base); - if (*sub != '/') + if (*link != '/') { - /* SUB is a relative URL: we need to replace everything - after last slash (possibly empty) with SUB. + /* LINK is a relative URL: we need to replace everything + after last slash (possibly empty) with LINK. - So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy", + So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", our result should be "whatever/foo/qux/xyzzy". */ int need_explicit_slash = 0; int span; const char *start_insert; - const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */ + const char *last_slash = find_last_char (base, end, '/'); if (!last_slash) { - /* No slash found at all. Append SUB to what we have, + /* No slash found at all. Append LINK to what we have, but we'll need a slash as a separator. - Example: if url == "foo" and sub == "qux/xyzzy", then - we cannot just append sub to url, because we'd get + Example: if base == "foo" and link == "qux/xyzzy", then + we cannot just append link to base, because we'd get "fooqux/xyzzy", whereas what we want is "foo/qux/xyzzy". 
@@ -1123,7 +1156,7 @@ construct (const char *url, const char *sub, int subsize, int no_proto) start_insert = end + 1; need_explicit_slash = 1; } - else if (last_slash && last_slash != url && *(last_slash - 1) == '/') + else if (last_slash && last_slash != base && *(last_slash - 1) == '/') { /* example: http://host" */ /* ^ */ @@ -1137,28 +1170,28 @@ construct (const char *url, const char *sub, int subsize, int no_proto) start_insert = last_slash + 1; } - span = start_insert - url; - constr = (char *)xmalloc (span + subsize + 1); + span = start_insert - base; + constr = (char *)xmalloc (span + linklength + 1); if (span) - memcpy (constr, url, span); + memcpy (constr, base, span); if (need_explicit_slash) constr[span - 1] = '/'; - if (subsize) - memcpy (constr + span, sub, subsize); - constr[span + subsize] = '\0'; + if (linklength) + memcpy (constr + span, link, linklength); + constr[span + linklength] = '\0'; } - else /* *sub == `/' */ + else /* *link == `/' */ { - /* SUB is an absolute path: we need to replace everything - after (and including) the FIRST slash with SUB. + /* LINK is an absolute path: we need to replace everything + after (and including) the FIRST slash with LINK. - So, if URL is "http://host/whatever/foo/bar", and SUB is + So, if BASE is "http://host/whatever/foo/bar", and LINK is "/qux/xyzzy", our result should be "http://host/qux/xyzzy". */ int span; const char *slash; const char *start_insert = NULL; /* for gcc to shut up. */ - const char *pos = url; + const char *pos = base; int seen_slash_slash = 0; /* We're looking for the first slash, but want to ignore double slash. */ @@ -1174,14 +1207,14 @@ construct (const char *url, const char *sub, int subsize, int no_proto) /* At this point, SLASH is the location of the first / after "//", or the first slash altogether. START_INSERT is the - pointer to the location where SUB will be inserted. 
When - examining the last two examples, keep in mind that SUB + pointer to the location where LINK will be inserted. When + examining the last two examples, keep in mind that LINK begins with '/'. */ if (!slash && !seen_slash_slash) /* example: "foo" */ /* ^ */ - start_insert = url; + start_insert = base; else if (!slash && seen_slash_slash) /* example: "http://foo" */ /* ^ */ @@ -1189,33 +1222,35 @@ construct (const char *url, const char *sub, int subsize, int no_proto) else if (slash && !seen_slash_slash) /* example: "foo/bar" */ /* ^ */ - start_insert = url; + start_insert = base; else if (slash && seen_slash_slash) /* example: "http://something/" */ /* ^ */ start_insert = slash; - span = start_insert - url; - constr = (char *)xmalloc (span + subsize + 1); + span = start_insert - base; + constr = (char *)xmalloc (span + linklength + 1); if (span) - memcpy (constr, url, span); - if (subsize) - memcpy (constr + span, sub, subsize); - constr[span + subsize] = '\0'; + memcpy (constr, base, span); + if (linklength) + memcpy (constr + span, link, linklength); + constr[span + linklength] = '\0'; } } else /* !no_proto */ { - constr = strdupdelim (sub, sub + subsize); + constr = strdupdelim (link, link + linklength); } return constr; } -/* Like the function above, but with a saner caller interface. */ +/* Merge BASE with LINK and return the resulting URI. This is an + interface to uri_merge_1 that assumes that LINK is a + zero-terminated string. 
*/ char * -url_concat (const char *base_url, const char *new_url) +uri_merge (const char *base, const char *link) { - return construct (base_url, new_url, strlen (new_url), !has_proto (new_url)); + return uri_merge_1 (base, link, strlen (link), !has_proto (link)); } /* Optimize URL by host, destructively replacing u->host with realhost @@ -1232,32 +1267,6 @@ opt_url (struct urlinfo *u) xfree (u->url); u->url = str_url (u, 0); } - -/* This beautiful kludge is fortunately not needed, as I've made - parse_dir do the (almost) right thing, so that a query can never - become a part of directory. */ -#if 0 -/* Call path_simplify, but make sure that the part after the - question-mark, if any, is not destroyed by path_simplify's - "optimizations". */ -void -path_simplify_with_kludge (char *path) -{ - char *query = strchr (path, '?'); - if (query) - /* path_simplify also works destructively, so we also have the - license to write. */ - *query = '\0'; - path_simplify (path); - if (query) - { - char *newend = path + strlen (path); - *query = '?'; - if (newend != query) - memmove (newend, query, strlen (query) + 1); - } -} -#endif /* Returns proxy host address, in accordance with PROTO. */ char * diff --git a/src/url.h b/src/url.h index bc06fe45..a180cc6e 100644 --- a/src/url.h +++ b/src/url.h @@ -26,10 +26,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEFAULT_HTTPS_PORT 443 -/* If the string contains unsafe characters, duplicate it with - encode_string, otherwise just copy it with strdup. */ -#define CLEANDUP(x) (contains_unsafe (x) ? encode_string (x) : xstrdup (x)) - /* Structure containing info on a URL. 
*/ struct urlinfo { @@ -97,7 +93,6 @@ typedef enum /* Function declarations */ -int contains_unsafe PARAMS ((const char *)); char *encode_string PARAMS ((const char *)); struct urlinfo *newurl PARAMS ((void)); @@ -115,7 +110,7 @@ urlpos *get_urls_file PARAMS ((const char *)); urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *)); void free_urlpos PARAMS ((urlpos *)); -char *url_concat PARAMS ((const char *, const char *)); +char *uri_merge PARAMS ((const char *, const char *)); void rotate_backups PARAMS ((const char *)); int mkalldirs PARAMS ((const char *)); diff --git a/src/wget.h b/src/wget.h index 1236ff14..0fa9765f 100644 --- a/src/wget.h +++ b/src/wget.h @@ -139,12 +139,21 @@ char *xstrdup_debug PARAMS ((const char *, const char *, int)); /* The smaller value of the two. */ #define MINVAL(x, y) ((x) < (y) ? (x) : (y)) -/* ASCII char -> HEX digit */ -#define ASC2HEXD(x) (((x) >= '0' && (x) <= '9') ? \ - ((x) - '0') : (TOUPPER(x) - 'A' + 10)) +/* Convert the ASCII character X to a hex-digit. X should be between + '0' and '9', or between 'A' and 'F', or between 'a' and 'f'. The + result is a number between 0 and 15. If X is not a hexadecimal + digit character, the result is undefined. */ +#define XCHAR_TO_XDIGIT(x) \ + (((x) >= '0' && (x) <= '9') ? \ + ((x) - '0') : (TOUPPER(x) - 'A' + 10)) -/* HEX digit -> ASCII char */ -#define HEXD2ASC(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'A')) +/* The reverse of the above: convert a HEX digit in the [0, 15] range + to an ASCII character representing it. The A-F characters are + always in upper case. */ +#define XDIGIT_TO_XCHAR(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'A')) + +/* Like XDIGIT_TO_XCHAR, but produce a lower-case char. */ +#define XDIGIT_TO_xchar(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'a')) #define ARRAY_SIZE(array) (sizeof (array) / sizeof (*(array)))