From 101f896e4791cf23ed875353c2965435117609cc Mon Sep 17 00:00:00 2001 From: hniksic Date: Fri, 19 Sep 2003 17:05:36 -0700 Subject: [PATCH] [svn] Minor fixes and cosmetic changes. (uri_merge): Get rid of uri_merge_1. (uri_merge): Merge "foo//", "bar" as "foo//bar", not "foo///bar", i.e. don't add an extra slash merely because BASE ends with two slashes. (parse_credentials): Renamed from parse_uname. Rewrittern in standard [beg, end) calling style. (url_skip_credentials): Renamed from url_skip_uname. Made static. (url_skip_credentials): Include # and ; as terminators. Old code would mistakenly consider "http://foo.com#hniksic@iskon.hr" to contain a username. (url_skip_scheme): Removed because it was unused. (url_has_scheme): Require "scheme" to be at least one char long. --- src/ChangeLog | 15 ++ src/url.c | 536 ++++++++++++++++++++++++-------------------------- src/url.h | 3 - 3 files changed, 274 insertions(+), 280 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 0a73fffc..80e9a0c0 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,18 @@ +2003-09-20 Hrvoje Niksic + + * url.c (uri_merge): Get rid of uri_merge_1. + (uri_merge): Merge "foo//", "bar" as "foo//bar", not "foo///bar", + i.e. don't add an extra slash merely because BASE ends with two + slashes. + (parse_credentials): Renamed from parse_uname. Rewrittern in + standard [beg, end) calling style. + (url_skip_credentials): Renamed from url_skip_uname. Made static. + (url_skip_credentials): Include # and ; as terminators. Old code + would mistakenly consider "http://foo.com#hniksic@iskon.hr" to + contain a username. + (url_skip_scheme): Removed because it was unused. + (url_has_scheme): Require "scheme" to be at least one char long. + 2003-09-19 Hrvoje Niksic * url.c (url_file_name): Expect NULL dir_prefix. diff --git a/src/url.c b/src/url.c index af44f9b7..0cead0fa 100644 --- a/src/url.c +++ b/src/url.c @@ -409,6 +409,7 @@ reencode_escapes (const char *s) /* Returns the scheme type if the scheme is supported, or SCHEME_INVALID if not. */ + enum url_scheme url_scheme (const char *url) { @@ -427,37 +428,25 @@ url_scheme (const char *url) return SCHEME_INVALID; } -/* Return the number of characters needed to skip the scheme part of - the URL, e.g. `http://'. If no scheme is found, returns 0. */ -int -url_skip_scheme (const char *url) -{ - const char *p = url; +#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+') - /* Skip the scheme name. We allow `-' and `+' because of `whois++', - etc. */ - while (ISALNUM (*p) || *p == '-' || *p == '+') - ++p; - if (*p != ':') - return 0; - /* Skip ':'. */ - ++p; +/* Return 1 if the URL begins with any "scheme", 0 otherwise. As + currently implemented, it returns true if URL begins with + [-+a-zA-Z0-9]+: . */ - /* Skip "//" if found. */ - if (*p == '/' && *(p + 1) == '/') - p += 2; - - return p - url; -} - -/* Returns 1 if the URL begins with a scheme (supported or - unsupported), 0 otherwise. */ int url_has_scheme (const char *url) { const char *p = url; - while (ISALNUM (*p) || *p == '-' || *p == '+') + + /* The first char must be a scheme char. */ + if (!*p || !SCHEME_CHAR (*p)) + return 0; + ++p; + /* Followed by 0 or more scheme chars. */ + while (*p && SCHEME_CHAR (*p)) ++p; + /* Terminated by ':'. */ return *p == ':'; } @@ -474,57 +463,51 @@ scheme_disable (enum url_scheme scheme) } /* Skip the username and password, if present here. The function - should be called *not* with the complete URL, but with the part + should *not* be called with the complete URL, but with the part right after the scheme. If no username and password are found, return 0. */ -int -url_skip_uname (const char *url) -{ - const char *p; - - /* Look for '@' that comes before '/' or '?'. */ - p = (const char *)strpbrk (url, "/?@"); - if (!p || *p != '@') - return 0; - - return p - url + 1; -} static int -parse_uname (const char *str, int len, char **user, char **passwd) +url_skip_credentials (const char *url) +{ + /* Look for '@' that comes before terminators, such as '/', '?', + '#', or ';'. */ + const char *p = (const char *)strpbrk (url, "@/?#;"); + if (!p || *p != '@') + return 0; + return p + 1 - url; +} + +/* Parse credentials contained in [BEG, END). The region is expected + to have come from a URL and is unescaped. */ + +static int +parse_credentials (const char *beg, const char *end, char **user, char **passwd) { char *colon; + const char *userend; - if (len == 0) - /* Empty user name not allowed. */ - return 0; + if (beg == end) + return 0; /* empty user name */ - colon = memchr (str, ':', len); - if (colon == str) - /* Empty user name again. */ - return 0; + colon = memchr (beg, ':', end - beg); + if (colon == beg) + return 0; /* again empty user name */ if (colon) { - int pwlen = len - (colon + 1 - str); - *passwd = xmalloc (pwlen + 1); - memcpy (*passwd, colon + 1, pwlen); - (*passwd)[pwlen] = '\0'; - len -= pwlen + 1; + *passwd = strdupdelim (colon + 1, end); + userend = colon; + url_unescape (*passwd); } else - *passwd = NULL; - - *user = xmalloc (len + 1); - memcpy (*user, str, len); - (*user)[len] = '\0'; - - if (*user) - url_unescape (*user); - if (*passwd) - url_unescape (*passwd); - + { + *passwd = NULL; + userend = end; + } + *user = strdupdelim (beg, userend); + url_unescape (*user); return 1; } @@ -540,6 +523,7 @@ parse_uname (const char *str, int len, char **user, char **passwd) foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file If the URL needs not or cannot be rewritten, return NULL. */ + char * rewrite_shorthand_url (const char *url) { @@ -586,7 +570,7 @@ rewrite_shorthand_url (const char *url) } } -static void parse_path PARAMS ((const char *, char **, char **)); +static void split_path PARAMS ((const char *, char **, char **)); /* Like strpbrk, with the exception that it returns the pointer to the terminating zero (end-of-string aka "eos") if no matching character @@ -835,7 +819,7 @@ url_parse (const char *url, int *error) p += strlen (supported_schemes[scheme].leading_string); uname_b = p; - p += url_skip_uname (p); + p += url_skip_credentials (p); uname_e = p; /* scheme://user:pass@host[:port]... */ @@ -979,7 +963,7 @@ url_parse (const char *url, int *error) /* http://user:pass@host */ /* ^ ^ */ /* uname_b uname_e */ - if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd)) + if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) { SETERR (error, PE_INVALID_USER_NAME); return NULL; @@ -997,7 +981,7 @@ url_parse (const char *url, int *error) u->path = strdupdelim (path_b, path_e); path_modified = path_simplify (u->path); - parse_path (u->path, &u->dir, &u->file); + split_path (u->path, &u->dir, &u->file); host_modified = lowercase_str (u->host); @@ -1037,15 +1021,25 @@ url_error (int error_code) return parse_errors[error_code]; } -/* Parse PATH into dir and file. PATH is extracted from the URL and - is URL-escaped. The function returns unescaped DIR and FILE. */ +/* Split PATH into DIR and FILE. PATH comes from the URL and is + expected to be URL-escaped. + + The path is split into directory (the part up to the last slash) + and file (the part after the last slash), which are subsequently + unescaped. Examples: + + PATH DIR FILE + "foo/bar/baz" "foo/bar" "baz" + "foo/bar/" "foo/bar" "" + "foo" "" "foo" + "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!) + + DIR and FILE are freshly allocated. */ static void -parse_path (const char *path, char **dir, char **file) +split_path (const char *path, char **dir, char **file) { - char *last_slash; - - last_slash = strrchr (path, '/'); + char *last_slash = strrchr (path, '/'); if (!last_slash) { *dir = xstrdup (""); @@ -1865,221 +1859,209 @@ path_simplify (char *path) return change; } -/* Resolve the result of "linking" a base URI (BASE) to a - link-specified URI (LINK). +/* Merge BASE with LINK and return the resulting URI. Either of the URIs may be absolute or relative, complete with the - host name, or path only. This tries to behave "reasonably" in all - foreseeable cases. It employs little specific knowledge about - schemes or URL-specific stuff -- it just works on strings. - - The parameters LINKLENGTH is useful if LINK is not zero-terminated. - See uri_merge for a gentler interface to this functionality. + host name, or path only. This tries to reasonably handle all + foreseeable cases. It only employs minimal URL parsing, without + knowledge of the specifics of schemes. Perhaps this function should call path_simplify so that the callers don't have to call url_parse unconditionally. */ -static char * -uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme) -{ - char *constr; - if (no_scheme) - { - const char *end = base + path_length (base); - - if (!*link) - { - /* Empty LINK points back to BASE, query string and all. */ - constr = xstrdup (base); - } - else if (*link == '?') - { - /* LINK points to the same location, but changes the query - string. Examples: */ - /* uri_merge("path", "?new") -> "path?new" */ - /* uri_merge("path?foo", "?new") -> "path?new" */ - /* uri_merge("path?foo#bar", "?new") -> "path?new" */ - /* uri_merge("path#foo", "?new") -> "path?new" */ - int baselength = end - base; - constr = xmalloc (baselength + linklength + 1); - memcpy (constr, base, baselength); - memcpy (constr + baselength, link, linklength); - constr[baselength + linklength] = '\0'; - } - else if (*link == '#') - { - /* uri_merge("path", "#new") -> "path#new" */ - /* uri_merge("path#foo", "#new") -> "path#new" */ - /* uri_merge("path?foo", "#new") -> "path?foo#new" */ - /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ - int baselength; - const char *end1 = strchr (base, '#'); - if (!end1) - end1 = base + strlen (base); - baselength = end1 - base; - constr = xmalloc (baselength + linklength + 1); - memcpy (constr, base, baselength); - memcpy (constr + baselength, link, linklength); - constr[baselength + linklength] = '\0'; - } - else if (linklength > 1 && *link == '/' && *(link + 1) == '/') - { - /* LINK begins with "//" and so is a net path: we need to - replace everything after (and including) the double slash - with LINK. */ - - /* uri_merge("foo", "//new/bar") -> "//new/bar" */ - /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ - /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ - - int span; - const char *slash; - const char *start_insert; - - /* Look for first slash. */ - slash = memchr (base, '/', end - base); - /* If found slash and it is a double slash, then replace - from this point, else default to replacing from the - beginning. */ - if (slash && *(slash + 1) == '/') - start_insert = slash; - else - start_insert = base; - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; - } - else if (*link == '/') - { - /* LINK is an absolute path: we need to replace everything - after (and including) the FIRST slash with LINK. - - So, if BASE is "http://host/whatever/foo/bar", and LINK is - "/qux/xyzzy", our result should be - "http://host/qux/xyzzy". */ - int span; - const char *slash; - const char *start_insert = NULL; /* for gcc to shut up. */ - const char *pos = base; - int seen_slash_slash = 0; - /* We're looking for the first slash, but want to ignore - double slash. */ - again: - slash = memchr (pos, '/', end - pos); - if (slash && !seen_slash_slash) - if (*(slash + 1) == '/') - { - pos = slash + 2; - seen_slash_slash = 1; - goto again; - } - - /* At this point, SLASH is the location of the first / after - "//", or the first slash altogether. START_INSERT is the - pointer to the location where LINK will be inserted. When - examining the last two examples, keep in mind that LINK - begins with '/'. */ - - if (!slash && !seen_slash_slash) - /* example: "foo" */ - /* ^ */ - start_insert = base; - else if (!slash && seen_slash_slash) - /* example: "http://foo" */ - /* ^ */ - start_insert = end; - else if (slash && !seen_slash_slash) - /* example: "foo/bar" */ - /* ^ */ - start_insert = base; - else if (slash && seen_slash_slash) - /* example: "http://something/" */ - /* ^ */ - start_insert = slash; - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - if (linklength) - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; - } - else - { - /* LINK is a relative URL: we need to replace everything - after last slash (possibly empty) with LINK. - - So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", - our result should be "whatever/foo/qux/xyzzy". */ - int need_explicit_slash = 0; - int span; - const char *start_insert; - const char *last_slash = find_last_char (base, end, '/'); - if (!last_slash) - { - /* No slash found at all. Append LINK to what we have, - but we'll need a slash as a separator. - - Example: if base == "foo" and link == "qux/xyzzy", then - we cannot just append link to base, because we'd get - "fooqux/xyzzy", whereas what we want is - "foo/qux/xyzzy". - - To make sure the / gets inserted, we set - need_explicit_slash to 1. We also set start_insert - to end + 1, so that the length calculations work out - correctly for one more (slash) character. Accessing - that character is fine, since it will be the - delimiter, '\0' or '?'. */ - /* example: "foo?..." */ - /* ^ ('?' gets changed to '/') */ - start_insert = end + 1; - need_explicit_slash = 1; - } - else if (last_slash && last_slash != base && *(last_slash - 1) == '/') - { - /* example: http://host" */ - /* ^ */ - start_insert = end + 1; - need_explicit_slash = 1; - } - else - { - /* example: "whatever/foo/bar" */ - /* ^ */ - start_insert = last_slash + 1; - } - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - if (need_explicit_slash) - constr[span - 1] = '/'; - if (linklength) - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; - } - } - else /* !no_scheme */ - { - constr = strdupdelim (link, link + linklength); - } - return constr; -} - -/* Merge BASE with LINK and return the resulting URI. This is an - interface to uri_merge_1 that assumes that LINK is a - zero-terminated string. */ char * uri_merge (const char *base, const char *link) { - return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link)); + int linklength; + const char *end; + char *merge; + + if (url_has_scheme (link)) + return xstrdup (link); + + /* We may not examine BASE past END. */ + end = base + path_length (base); + linklength = strlen (link); + + if (!*link) + { + /* Empty LINK points back to BASE, query string and all. */ + return xstrdup (base); + } + else if (*link == '?') + { + /* LINK points to the same location, but changes the query + string. Examples: */ + /* uri_merge("path", "?new") -> "path?new" */ + /* uri_merge("path?foo", "?new") -> "path?new" */ + /* uri_merge("path?foo#bar", "?new") -> "path?new" */ + /* uri_merge("path#foo", "?new") -> "path?new" */ + int baselength = end - base; + merge = xmalloc (baselength + linklength + 1); + memcpy (merge, base, baselength); + memcpy (merge + baselength, link, linklength); + merge[baselength + linklength] = '\0'; + } + else if (*link == '#') + { + /* uri_merge("path", "#new") -> "path#new" */ + /* uri_merge("path#foo", "#new") -> "path#new" */ + /* uri_merge("path?foo", "#new") -> "path?foo#new" */ + /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ + int baselength; + const char *end1 = strchr (base, '#'); + if (!end1) + end1 = base + strlen (base); + baselength = end1 - base; + merge = xmalloc (baselength + linklength + 1); + memcpy (merge, base, baselength); + memcpy (merge + baselength, link, linklength); + merge[baselength + linklength] = '\0'; + } + else if (*link == '/' && *(link + 1) == '/') + { + /* LINK begins with "//" and so is a net path: we need to + replace everything after (and including) the double slash + with LINK. */ + + /* uri_merge("foo", "//new/bar") -> "//new/bar" */ + /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ + /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ + + int span; + const char *slash; + const char *start_insert; + + /* Look for first slash. */ + slash = memchr (base, '/', end - base); + /* If found slash and it is a double slash, then replace + from this point, else default to replacing from the + beginning. */ + if (slash && *(slash + 1) == '/') + start_insert = slash; + else + start_insert = base; + + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; + } + else if (*link == '/') + { + /* LINK is an absolute path: we need to replace everything + after (and including) the FIRST slash with LINK. + + So, if BASE is "http://host/whatever/foo/bar", and LINK is + "/qux/xyzzy", our result should be + "http://host/qux/xyzzy". */ + int span; + const char *slash; + const char *start_insert = NULL; /* for gcc to shut up. */ + const char *pos = base; + int seen_slash_slash = 0; + /* We're looking for the first slash, but want to ignore + double slash. */ + again: + slash = memchr (pos, '/', end - pos); + if (slash && !seen_slash_slash) + if (*(slash + 1) == '/') + { + pos = slash + 2; + seen_slash_slash = 1; + goto again; + } + + /* At this point, SLASH is the location of the first / after + "//", or the first slash altogether. START_INSERT is the + pointer to the location where LINK will be inserted. When + examining the last two examples, keep in mind that LINK + begins with '/'. */ + + if (!slash && !seen_slash_slash) + /* example: "foo" */ + /* ^ */ + start_insert = base; + else if (!slash && seen_slash_slash) + /* example: "http://foo" */ + /* ^ */ + start_insert = end; + else if (slash && !seen_slash_slash) + /* example: "foo/bar" */ + /* ^ */ + start_insert = base; + else if (slash && seen_slash_slash) + /* example: "http://something/" */ + /* ^ */ + start_insert = slash; + + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; + } + else + { + /* LINK is a relative URL: we need to replace everything + after last slash (possibly empty) with LINK. + + So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", + our result should be "whatever/foo/qux/xyzzy". */ + int need_explicit_slash = 0; + int span; + const char *start_insert; + const char *last_slash = find_last_char (base, end, '/'); + if (!last_slash) + { + /* No slash found at all. Append LINK to what we have, + but we'll need a slash as a separator. + + Example: if base == "foo" and link == "qux/xyzzy", then + we cannot just append link to base, because we'd get + "fooqux/xyzzy", whereas what we want is + "foo/qux/xyzzy". + + To make sure the / gets inserted, we set + need_explicit_slash to 1. We also set start_insert + to end + 1, so that the length calculations work out + correctly for one more (slash) character. Accessing + that character is fine, since it will be the + delimiter, '\0' or '?'. */ + /* example: "foo?..." */ + /* ^ ('?' gets changed to '/') */ + start_insert = end + 1; + need_explicit_slash = 1; + } + else if (last_slash && last_slash >= base + 2 + && last_slash[-2] == ':' && last_slash[-1] == '/') + { + /* example: http://host" */ + /* ^ */ + start_insert = end + 1; + need_explicit_slash = 1; + } + else + { + /* example: "whatever/foo/bar" */ + /* ^ */ + start_insert = last_slash + 1; + } + + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + if (need_explicit_slash) + merge[span - 1] = '/'; + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; + } + + return merge; } #define APPEND(p, s) do { \ diff --git a/src/url.h b/src/url.h index d80fe54d..5cf9e4f6 100644 --- a/src/url.h +++ b/src/url.h @@ -140,13 +140,10 @@ void url_set_file PARAMS ((struct url *, const char *)); void url_free PARAMS ((struct url *)); enum url_scheme url_scheme PARAMS ((const char *)); -int url_skip_scheme PARAMS ((const char *)); int url_has_scheme PARAMS ((const char *)); int scheme_default_port PARAMS ((enum url_scheme)); void scheme_disable PARAMS ((enum url_scheme)); -int url_skip_uname PARAMS ((const char *)); - char *url_string PARAMS ((const struct url *, int)); struct urlpos *get_urls_file PARAMS ((const char *));