1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Committed my patch from <sxsy9z4xz5m.fsf@florida.arsdigita.de>

(recognize HTML entities.)
This commit is contained in:
hniksic 2000-10-31 17:25:12 -08:00
parent 846b045a69
commit 515d82fb95
4 changed files with 121 additions and 17 deletions

View File

@ -1,3 +1,12 @@
2000-11-01 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (get_urls_html): Decode HTML entities using
html_decode_entities.
* html.c (htmlfindurl): Don't count the `#' in numeric entities
(&#NNN;) as an HTML fragment.
(html_decode_entities): New function.
2000-11-01 Hrvoje Niksic <hniksic@arsdigita.com>
* html.c (htmlfindurl): Fix recognition of # HTML fragments.

View File

@ -91,7 +91,6 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
}
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
describing URLs to follow. When a tag is encountered, extract its
components (as described by html_allow[] array), and return the
@ -270,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
for (++buf, --bufsize;
bufsize && *buf != s->quote_char && *buf != '\n';
++buf, --bufsize)
if (!ph && *buf == '#')
if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
{
@ -294,7 +293,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
p = buf;
for (; bufsize && !ISSPACE (*buf) && *buf != '>';
++buf, --bufsize)
if (!ph && *buf == '#')
if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
break;
@ -437,6 +436,83 @@ html_base (void)
return global_state.base;
}
/* Create a malloc'ed copy of text in the range [BEG, END), but with
   the HTML entities processed.  Recognized entities are &lt, &gt,
   &amp, &quot, &nbsp and the numerical entities (&#NNN).

   An entity is accepted when it is terminated by `;' (which is
   consumed), by the end of the range, or -- for named entities -- by
   any non-alphanumeric character and -- for numeric ones -- by any
   non-alphabetic character.  Anything else that starts with `&' is
   copied through unchanged.

   Only the low eight bits of a numeric entity are used.  A numeric
   entity that would decode to the NUL byte is left undecoded, because
   the result is a C string and an embedded NUL would silently
   truncate it.

   The result is always NUL-terminated and never longer than the
   input; the caller owns it and must free it.  */
char *
html_decode_entities (const char *beg, const char *end)
{
  /* Decoding can only shrink the text, so the input length plus room
     for the terminator is the worst case.  */
  char *newstr = (char *)xmalloc (end - beg + 1);
  const char *from = beg;
  char *to = newstr;

  while (from < end)
    {
      const char *save;
      int remain;

      if (*from != '&')
        {
          *to++ = *from++;
          continue;
        }

      save = from;
      if (++from == end)
        goto lose;
      /* Number of characters available after the `&'.  */
      remain = end - from;

      /* True if the text at FROM is the entity name LITERAL followed
         by a valid terminator.  The `remain == length' test must come
         before the character following the name is examined;
         otherwise we would read one byte past END when the entity
         sits at the very end of the range.  */
#define FROB(literal) (remain >= (int) (sizeof (literal) - 1)         \
                       && !memcmp (from, literal, sizeof (literal) - 1) \
                       && (remain == (int) (sizeof (literal) - 1)     \
                           || *(from + sizeof (literal) - 1) == ';'   \
                           || !ISALNUM (*(from + sizeof (literal) - 1))))

      if (*from == '#')
        {
          unsigned int numeric = 0;
          ++from;
          if (from == end || !ISDIGIT (*from))
            goto lose;
          /* Only the low eight bits are kept, so reduce at every step.
             This yields the same value as masking once at the end,
             but cannot overflow however many digits follow.  */
          for (; from < end && ISDIGIT (*from); from++)
            numeric = (10 * numeric + (unsigned int) (*from - '0')) & 0xff;
          if (from < end && ISALPHA (*from))
            goto lose;
          /* Never plant a NUL in the middle of the result.  */
          if (numeric == 0)
            goto lose;
          *to++ = (char) numeric;
        }
      else if (FROB ("lt"))
        {
          *to++ = '<';
          from += 2;
        }
      else if (FROB ("gt"))
        {
          *to++ = '>';
          from += 2;
        }
      else if (FROB ("amp"))
        {
          *to++ = '&';
          from += 3;
        }
      else if (FROB ("quot"))
        {
          *to++ = '\"';
          from += 4;
        }
      /* We don't implement the "Added Latin 1" entities proposed
         by rfc1866 (except for nbsp), because it is unnecessary
         in the context of Wget, and would require hashing to work
         efficiently.  */
      else if (FROB ("nbsp"))
        {
          *to++ = 160;
          from += 4;
        }
      else
        goto lose;
#undef FROB

      /* If the entity was followed by `;', we step over the `;'.
         Otherwise, it was followed by either a non-alphanumeric
         or EOB, in which case we do nothing.  */
      if (from < end && *from == ';')
        ++from;
      continue;

    lose:
      /* This was not an entity after all.  Back out and copy the `&'
         literally; whatever follows it is handled by the next
         iterations as ordinary text.  */
      from = save;
      *to++ = *from++;
    }
  *to++ = '\0';
  /* #### Should we try to do this: */
#if 0
  newstr = xrealloc (newstr, to - newstr);
#endif
  return newstr;
}
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
entities, as per RFC1866:

View File

@ -34,6 +34,7 @@ struct fileinfo;
/* Function declarations */
const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
const char *html_base PARAMS ((void));
/* Return a newly allocated, NUL-terminated copy of the text in
   [beg, end) with the recognized HTML entities decoded.  The caller
   frees the result.  */
char *html_decode_entities PARAMS ((const char *, const char *));
uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
#endif /* HTML_H */

View File

@ -869,6 +869,7 @@ get_urls_html (const char *file, const char *this_url, int silent,
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
char *needs_freeing, *url_data;
first_time = 0;
@ -889,16 +890,27 @@ get_urls_html (const char *file, const char *this_url, int silent,
if (!size)
break;
/* It would be nice if we could avoid allocating memory in this
loop, but I don't see an easy way. To process the entities,
we need to either copy the data, or change it destructively.
I choose the former.
We have two pointers: needs_freeing and url_data, because the
code below does things like url_data += <something>, and we
want to pass the original string to free(). */
needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
size = strlen (url_data);
for (i = 0; protostrings[i]; i++)
{
if (!strncasecmp (protostrings[i], pbuf,
if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
&& !(strncasecmp (pbuf, "http:", 5) == 0
&& strncasecmp (pbuf, "http://", 7) != 0))
&& !(strncasecmp (url_data, "http:", 5) == 0
&& strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
@ -909,21 +921,24 @@ get_urls_html (const char *file, const char *this_url, int silent,
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
pbuf += 5, size -= 5;
if ((size > 5) && !strncasecmp ("http:", url_data, 5))
url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
if (!strncasecmp (sup_protos[i].name, pbuf,
if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
{
free (needs_freeing);
continue;
}
}
if (no_proto)
{
/* First, construct the base, which can be relative itself.
@ -945,13 +960,14 @@ get_urls_html (const char *file, const char *this_url, int silent,
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
strncpy (temp, pbuf, size);
strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
free (needs_freeing);
continue;
}
if (this_url)
@ -966,17 +982,18 @@ get_urls_html (const char *file, const char *this_url, int silent,
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
constr = construct (base, pbuf, size, no_proto);
constr = construct (base, url_data, size, no_proto);
free (base);
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
strncpy (constr, pbuf, size);
strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
@ -988,7 +1005,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
strncpy (tmp, pbuf, size);
strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
@ -1009,14 +1026,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
current->size = size;
current->pos = pbuf - orig_buf;
current->size = step;
current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
if (no_proto && *pbuf != '/')
if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
free (needs_freeing);
}
free (orig_buf);