diff --git a/src/ChangeLog b/src/ChangeLog
index 88c45013..f712bacc 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,12 @@
+2000-11-01  Hrvoje Niksic
+
+	* url.c (get_urls_html): Decode HTML entities using
+	html_decode_entities.
+
+	* html.c (htmlfindurl): Don't count the `#' in numeric entities
+	(&#NNN;) as an HTML fragment.
+	(html_decode_entities): New function.
+
 2000-11-01  Hrvoje Niksic
 
 	* html.c (htmlfindurl): Fix recognition of # HTML fragments.
diff --git a/src/html.c b/src/html.c
index ace0e31b..7d990505 100644
--- a/src/html.c
+++ b/src/html.c
@@ -91,7 +91,6 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
   return FALSE;	/* not one of the tag/attribute pairs wget ever cares about */
 }
 
-
 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
    describing URLs to follow.  When a tag is encountered, extract its
    components (as described by html_allow[] array), and return the
@@ -270,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
 	      for (++buf, --bufsize;
 		   bufsize && *buf != s->quote_char && *buf != '\n';
 		   ++buf, --bufsize)
-		if (!ph && *buf == '#')
+		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		{
@@ -294,7 +293,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
 	      p = buf;
 	      for (; bufsize && !ISSPACE (*buf) && *buf != '>';
 		   ++buf, --bufsize)
-		if (!ph && *buf == '#')
+		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		break;
@@ -437,6 +436,83 @@ html_base (void)
   return global_state.base;
 }
 
+/* Create a malloc'ed copy of text in the range [beg, end), but with
+   the HTML entities processed.  Recognized entities are &lt;, &gt;,
+   &amp;, &quot;, &nbsp; and the numerical entities.  */
+
+char *
+html_decode_entities (const char *beg, const char *end)
+{
+  char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
+  const char *from = beg;
+  char *to = newstr;
+
+  while (from < end)
+    {
+      if (*from != '&')
+	*to++ = *from++;
+      else
+	{
+	  const char *save = from;
+	  int remain;
+
+	  if (++from == end) goto lose;
+	  remain = end - from;
+
+	  if (*from == '#')
+	    {
+	      int numeric;
+	      ++from;
+	      if (from == end || !ISDIGIT (*from)) goto lose;
+	      for (numeric = 0; from < end && ISDIGIT (*from); from++)
+		numeric = 10 * numeric + (*from) - '0';
+	      if (from < end && ISALPHA (*from)) goto lose;
+	      numeric &= 0xff;
+	      *to++ = numeric;
+	    }
+#define FROB(literal) (remain >= (sizeof (literal) - 1)			\
+		       && !memcmp (from, literal, sizeof (literal) - 1)	\
+		       && (*(from + sizeof (literal) - 1) == ';'	\
+			   || remain == sizeof (literal) - 1		\
+			   || !ISALNUM (*(from + sizeof (literal) - 1))))
+	  else if (FROB ("lt"))
+	    *to++ = '<', from += 2;
+	  else if (FROB ("gt"))
+	    *to++ = '>', from += 2;
+	  else if (FROB ("amp"))
+	    *to++ = '&', from += 3;
+	  else if (FROB ("quot"))
+	    *to++ = '\"', from += 4;
+	  /* We don't implement the "Added Latin 1" entities proposed
+	     by rfc1866 (except for nbsp), because it is unnecessary
+	     in the context of Wget, and would require hashing to work
+	     efficiently.  */
+	  else if (FROB ("nbsp"))
+	    *to++ = 160, from += 4;
+	  else
+	    goto lose;
+#undef FROB
+	  /* If the entity was followed by `;', we step over the `;'.
+	     Otherwise, it was followed by either a non-alphanumeric
+	     or EOB, in which case we do nothing.  */
+	  if (from < end && *from == ';')
+	    ++from;
+	  continue;
+
+	lose:
+	  /* This was not an entity after all.  Back out.  */
+	  from = save;
+	  *to++ = *from++;
+	}
+    }
+  *to++ = '\0';
+  /* #### Should we try to do this: */
+#if 0
+  newstr = xrealloc (newstr, to - newstr);
+#endif
+  return newstr;
+}
+
 /* The function returns the pointer to the malloc-ed quoted version of
    string s.  It will recognize and quote numeric and special graphic
    entities, as per RFC1866:
diff --git a/src/html.h b/src/html.h
index 7fa0132e..824b6ca9 100644
--- a/src/html.h
+++ b/src/html.h
@@ -34,6 +34,7 @@ struct fileinfo;
 /* Function declarations */
 const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
 const char *html_base PARAMS ((void));
+char *html_decode_entities PARAMS ((const char *, const char *));
 uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
 
 #endif /* HTML_H */
diff --git a/src/url.c b/src/url.c
index d68f1851..0a9fa4da 100644
--- a/src/url.c
+++ b/src/url.c
@@ -869,6 +869,7 @@ get_urls_html (const char *file, const char *this_url, int silent,
       const char *pbuf = buf;
       char *constr, *base;
       const char *cbase;
+      char *needs_freeing, *url_data;
 
       first_time = 0;
 
@@ -889,16 +890,27 @@ get_urls_html (const char *file, const char *this_url, int silent,
       if (!size)
 	break;
 
+      /* It would be nice if we could avoid allocating memory in this
+	 loop, but I don't see an easy way.  To process the entities,
+	 we need to either copy the data, or change it destructively.
+	 I choose the former.
+
+	 We have two pointers: needs_freeing and url_data, because the
+	 code below does things like url_data += 5, and we want to
+	 pass the original string to free().  */
+      needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+      size = strlen (url_data);
+
       for (i = 0; protostrings[i]; i++)
 	{
-	  if (!strncasecmp (protostrings[i], pbuf,
+	  if (!strncasecmp (protostrings[i], url_data,
 			    MINVAL (strlen (protostrings[i]), size)))
 	    break;
 	}
       /* Check for http:RELATIVE_URI.  See below for details.  */
       if (protostrings[i]
-	  && !(strncasecmp (pbuf, "http:", 5) == 0
-	       && strncasecmp (pbuf, "http://", 7) != 0))
+	  && !(strncasecmp (url_data, "http:", 5) == 0
+	       && strncasecmp (url_data, "http://", 7) != 0))
 	{
 	  no_proto = 0;
 	}
@@ -909,20 +921,23 @@
 	     relative URI-s as .  Just strip off the
 	     silly leading "http:" (as well as any leading blanks
 	     before it).  */
-	  if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
-	    pbuf += 5, size -= 5;
+	  if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+	    url_data += 5, size -= 5;
 	}
       if (!no_proto)
 	{
 	  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 	    {
-	      if (!strncasecmp (sup_protos[i].name, pbuf,
+	      if (!strncasecmp (sup_protos[i].name, url_data,
 				MINVAL (strlen (sup_protos[i].name), size)))
 		break;
 	    }
 	  /* Do *not* accept a non-supported protocol.  */
 	  if (i == ARRAY_SIZE (sup_protos))
-	    continue;
+	    {
+	      free (needs_freeing);
+	      continue;
+	    }
 	}
       if (no_proto)
 	{
@@ -945,13 +960,14 @@
 		  /* Use malloc, not alloca because this is called in
 		     a loop.  */
 		  char *temp = (char *)malloc (size + 1);
-		  strncpy (temp, pbuf, size);
+		  strncpy (temp, url_data, size);
 		  temp[size] = '\0';
 		  logprintf (LOG_NOTQUIET,
 			     _("Error (%s): Link %s without a base provided.\n"),
 			     file, temp);
 		  free (temp);
 		}
+	      free (needs_freeing);
 	      continue;
 	    }
 	  if (this_url)
@@ -966,17 +982,18 @@
 		{
 		  logprintf (LOG_NOTQUIET, _("\
 Error (%s): Base %s relative, without referer URL.\n"), file, cbase);
+		  free (needs_freeing);
 		  continue;
 		}
 	      base = xstrdup (cbase);
 	    }
-	  constr = construct (base, pbuf, size, no_proto);
+	  constr = construct (base, url_data, size, no_proto);
 	  free (base);
 	}
       else /* has proto */
 	{
 	  constr = (char *)xmalloc (size + 1);
-	  strncpy (constr, pbuf, size);
+	  strncpy (constr, url_data, size);
 	  constr[size] = '\0';
 	}
 #ifdef DEBUG
@@ -988,7 +1005,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
 	  tmp2 = html_base ();
 	  /* Use malloc, not alloca because this is called in a loop. */
 	  tmp = (char *)xmalloc (size + 1);
-	  strncpy (tmp, pbuf, size);
+	  strncpy (tmp, url_data, size);
 	  tmp[size] = '\0';
 	  logprintf (LOG_ALWAYS,
 		     "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
@@ -1009,14 +1026,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
       memset (current, 0, sizeof (*current));
       current->next = NULL;
       current->url = constr;
-      current->size = size;
-      current->pos = pbuf - orig_buf;
+      current->size = step;
+      current->pos = buf - orig_buf;
       /* A URL is relative if the host and protocol are not named, and
 	 the name does not start with `/'.  */
-      if (no_proto && *pbuf != '/')
+      if (no_proto && *url_data != '/')
 	current->flags |= (URELATIVE | UNOPROTO);
       else if (no_proto)
 	current->flags |= UNOPROTO;
+      free (needs_freeing);
     }
 
   free (orig_buf);
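
Note on usage (not part of the patch): html_decode_entities() returns a freshly malloc'ed decoded copy of the [beg, end) range, and the caller owns that copy.  That is why get_urls_html() keeps the separate needs_freeing pointer, re-derives size with strlen() on the decoded string, and frees needs_freeing on every path that leaves the loop iteration.  The program below is a minimal, hypothetical sketch of a caller; it assumes it is compiled and linked inside the patched Wget source tree, which provides html.h, the PARAMS macro, and xmalloc().

/* Illustrative sketch only -- not part of the patch.  Assumes the
   patched Wget sources (html.h / html.c and their support code). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "html.h"

int
main (void)
{
  const char *raw = "index.cgi?fish=1&amp;chips=2&#65;&lt;b&gt;";

  /* Decode the entities in [raw, raw + strlen (raw)).  The result is
     a fresh malloc'ed string that the caller must free.  */
  char *decoded = html_decode_entities (raw, raw + strlen (raw));

  /* Expected output: index.cgi?fish=1&chips=2A<b>
     (&amp; -> &, &#65; -> A, &lt;/&gt; -> < and >) */
  printf ("%s\n", decoded);

  free (decoded);
  return 0;
}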