mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Committed my patch from <sxsy9z4xz5m.fsf@florida.arsdigita.de>
(recognize HTML entities.)
This commit is contained in:
parent
846b045a69
commit
515d82fb95
@ -1,3 +1,12 @@
|
|||||||
|
2000-11-01 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||||
|
|
||||||
|
* url.c (get_urls_html): Decode HTML entities using
|
||||||
|
html_decode_entities.
|
||||||
|
|
||||||
|
* html.c (htmlfindurl): Don't count the `#' in numeric entities
|
||||||
|
(&#NNN;) as an HTML fragment.
|
||||||
|
(html_decode_entities): New function.
|
||||||
|
|
||||||
2000-11-01 Hrvoje Niksic <hniksic@arsdigita.com>
|
2000-11-01 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||||
|
|
||||||
* html.c (htmlfindurl): Fix recognition of # HTML fragments.
|
* html.c (htmlfindurl): Fix recognition of # HTML fragments.
|
||||||
|
82
src/html.c
82
src/html.c
@ -91,7 +91,6 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
|
|||||||
return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
|
return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
|
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
|
||||||
describing URLs to follow. When a tag is encountered, extract its
|
describing URLs to follow. When a tag is encountered, extract its
|
||||||
components (as described by html_allow[] array), and return the
|
components (as described by html_allow[] array), and return the
|
||||||
@ -270,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
|
|||||||
for (++buf, --bufsize;
|
for (++buf, --bufsize;
|
||||||
bufsize && *buf != s->quote_char && *buf != '\n';
|
bufsize && *buf != s->quote_char && *buf != '\n';
|
||||||
++buf, --bufsize)
|
++buf, --bufsize)
|
||||||
if (!ph && *buf == '#')
|
if (!ph && *buf == '#' && *(buf - 1) != '&')
|
||||||
ph = buf;
|
ph = buf;
|
||||||
if (!bufsize)
|
if (!bufsize)
|
||||||
{
|
{
|
||||||
@ -294,7 +293,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
|
|||||||
p = buf;
|
p = buf;
|
||||||
for (; bufsize && !ISSPACE (*buf) && *buf != '>';
|
for (; bufsize && !ISSPACE (*buf) && *buf != '>';
|
||||||
++buf, --bufsize)
|
++buf, --bufsize)
|
||||||
if (!ph && *buf == '#')
|
if (!ph && *buf == '#' && *(buf - 1) != '&')
|
||||||
ph = buf;
|
ph = buf;
|
||||||
if (!bufsize)
|
if (!bufsize)
|
||||||
break;
|
break;
|
||||||
@ -437,6 +436,83 @@ html_base (void)
|
|||||||
return global_state.base;
|
return global_state.base;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Create a malloc'ed copy of text in the range [beg, end), but with
|
||||||
|
the HTML entities processed. Recognized entities are &lt;, &gt;,
|
||||||
|
&amp;, &quot;, &nbsp; and the numerical entities. */
|
||||||
|
|
||||||
|
char *
|
||||||
|
html_decode_entities (const char *beg, const char *end)
|
||||||
|
{
|
||||||
|
char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
|
||||||
|
const char *from = beg;
|
||||||
|
char *to = newstr;
|
||||||
|
|
||||||
|
while (from < end)
|
||||||
|
{
|
||||||
|
if (*from != '&')
|
||||||
|
*to++ = *from++;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const char *save = from;
|
||||||
|
int remain;
|
||||||
|
|
||||||
|
if (++from == end) goto lose;
|
||||||
|
remain = end - from;
|
||||||
|
|
||||||
|
if (*from == '#')
|
||||||
|
{
|
||||||
|
int numeric;
|
||||||
|
++from;
|
||||||
|
if (from == end || !ISDIGIT (*from)) goto lose;
|
||||||
|
for (numeric = 0; from < end && ISDIGIT (*from); from++)
|
||||||
|
numeric = 10 * numeric + (*from) - '0';
|
||||||
|
if (from < end && ISALPHA (*from)) goto lose;
|
||||||
|
numeric &= 0xff;
|
||||||
|
*to++ = numeric;
|
||||||
|
}
|
||||||
|
#define FROB(literal) (remain >= (sizeof (literal) - 1) \
|
||||||
|
&& !memcmp (from, literal, sizeof (literal) - 1) \
|
||||||
|
&& (*(from + sizeof (literal) - 1) == ';' \
|
||||||
|
|| remain == sizeof (literal) - 1 \
|
||||||
|
|| !ISALNUM (*(from + sizeof (literal) - 1))))
|
||||||
|
else if (FROB ("lt"))
|
||||||
|
*to++ = '<', from += 2;
|
||||||
|
else if (FROB ("gt"))
|
||||||
|
*to++ = '>', from += 2;
|
||||||
|
else if (FROB ("amp"))
|
||||||
|
*to++ = '&', from += 3;
|
||||||
|
else if (FROB ("quot"))
|
||||||
|
*to++ = '\"', from += 4;
|
||||||
|
/* We don't implement the "Added Latin 1" entities proposed
|
||||||
|
by rfc1866 (except for nbsp), because it is unnecessary
|
||||||
|
in the context of Wget, and would require hashing to work
|
||||||
|
efficiently. */
|
||||||
|
else if (FROB ("nbsp"))
|
||||||
|
*to++ = 160, from += 4;
|
||||||
|
else
|
||||||
|
goto lose;
|
||||||
|
#undef FROB
|
||||||
|
/* If the entity was followed by `;', we step over the `;'.
|
||||||
|
Otherwise, it was followed by either a non-alphanumeric
|
||||||
|
or EOB, in which case we do nothing. */
|
||||||
|
if (from < end && *from == ';')
|
||||||
|
++from;
|
||||||
|
continue;
|
||||||
|
|
||||||
|
lose:
|
||||||
|
/* This was not an entity after all. Back out. */
|
||||||
|
from = save;
|
||||||
|
*to++ = *from++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*to++ = '\0';
|
||||||
|
/* #### Should we try to do this: */
|
||||||
|
#if 0
|
||||||
|
newstr = xrealloc (newstr, to - newstr);
|
||||||
|
#endif
|
||||||
|
return newstr;
|
||||||
|
}
|
||||||
|
|
||||||
/* The function returns the pointer to the malloc-ed quoted version of
|
/* The function returns the pointer to the malloc-ed quoted version of
|
||||||
string s. It will recognize and quote numeric and special graphic
|
string s. It will recognize and quote numeric and special graphic
|
||||||
entities, as per RFC1866:
|
entities, as per RFC1866:
|
||||||
|
@ -34,6 +34,7 @@ struct fileinfo;
|
|||||||
/* Function declarations */
|
/* Function declarations */
|
||||||
const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
|
const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
|
||||||
const char *html_base PARAMS ((void));
|
const char *html_base PARAMS ((void));
|
||||||
|
char *html_decode_entities PARAMS ((const char *, const char *));
|
||||||
uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
|
uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
|
||||||
|
|
||||||
#endif /* HTML_H */
|
#endif /* HTML_H */
|
||||||
|
46
src/url.c
46
src/url.c
@ -869,6 +869,7 @@ get_urls_html (const char *file, const char *this_url, int silent,
|
|||||||
const char *pbuf = buf;
|
const char *pbuf = buf;
|
||||||
char *constr, *base;
|
char *constr, *base;
|
||||||
const char *cbase;
|
const char *cbase;
|
||||||
|
char *needs_freeing, *url_data;
|
||||||
|
|
||||||
first_time = 0;
|
first_time = 0;
|
||||||
|
|
||||||
@ -889,16 +890,27 @@ get_urls_html (const char *file, const char *this_url, int silent,
|
|||||||
if (!size)
|
if (!size)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/* It would be nice if we could avoid allocating memory in this
|
||||||
|
loop, but I don't see an easy way. To process the entities,
|
||||||
|
we need to either copy the data, or change it destructively.
|
||||||
|
I choose the former.
|
||||||
|
|
||||||
|
We have two pointers: needs_freeing and url_data, because the
|
||||||
|
code below does things like url_data += <something>, and we
|
||||||
|
want to pass the original string to free(). */
|
||||||
|
needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
|
||||||
|
size = strlen (url_data);
|
||||||
|
|
||||||
for (i = 0; protostrings[i]; i++)
|
for (i = 0; protostrings[i]; i++)
|
||||||
{
|
{
|
||||||
if (!strncasecmp (protostrings[i], pbuf,
|
if (!strncasecmp (protostrings[i], url_data,
|
||||||
MINVAL (strlen (protostrings[i]), size)))
|
MINVAL (strlen (protostrings[i]), size)))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Check for http:RELATIVE_URI. See below for details. */
|
/* Check for http:RELATIVE_URI. See below for details. */
|
||||||
if (protostrings[i]
|
if (protostrings[i]
|
||||||
&& !(strncasecmp (pbuf, "http:", 5) == 0
|
&& !(strncasecmp (url_data, "http:", 5) == 0
|
||||||
&& strncasecmp (pbuf, "http://", 7) != 0))
|
&& strncasecmp (url_data, "http://", 7) != 0))
|
||||||
{
|
{
|
||||||
no_proto = 0;
|
no_proto = 0;
|
||||||
}
|
}
|
||||||
@ -909,20 +921,23 @@ get_urls_html (const char *file, const char *this_url, int silent,
|
|||||||
relative URI-s as <a href="http:URL">. Just strip off the
|
relative URI-s as <a href="http:URL">. Just strip off the
|
||||||
silly leading "http:" (as well as any leading blanks
|
silly leading "http:" (as well as any leading blanks
|
||||||
before it). */
|
before it). */
|
||||||
if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
|
if ((size > 5) && !strncasecmp ("http:", url_data, 5))
|
||||||
pbuf += 5, size -= 5;
|
url_data += 5, size -= 5;
|
||||||
}
|
}
|
||||||
if (!no_proto)
|
if (!no_proto)
|
||||||
{
|
{
|
||||||
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
|
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
|
||||||
{
|
{
|
||||||
if (!strncasecmp (sup_protos[i].name, pbuf,
|
if (!strncasecmp (sup_protos[i].name, url_data,
|
||||||
MINVAL (strlen (sup_protos[i].name), size)))
|
MINVAL (strlen (sup_protos[i].name), size)))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Do *not* accept a non-supported protocol. */
|
/* Do *not* accept a non-supported protocol. */
|
||||||
if (i == ARRAY_SIZE (sup_protos))
|
if (i == ARRAY_SIZE (sup_protos))
|
||||||
continue;
|
{
|
||||||
|
free (needs_freeing);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (no_proto)
|
if (no_proto)
|
||||||
{
|
{
|
||||||
@ -945,13 +960,14 @@ get_urls_html (const char *file, const char *this_url, int silent,
|
|||||||
/* Use malloc, not alloca because this is called in
|
/* Use malloc, not alloca because this is called in
|
||||||
a loop. */
|
a loop. */
|
||||||
char *temp = (char *)malloc (size + 1);
|
char *temp = (char *)malloc (size + 1);
|
||||||
strncpy (temp, pbuf, size);
|
strncpy (temp, url_data, size);
|
||||||
temp[size] = '\0';
|
temp[size] = '\0';
|
||||||
logprintf (LOG_NOTQUIET,
|
logprintf (LOG_NOTQUIET,
|
||||||
_("Error (%s): Link %s without a base provided.\n"),
|
_("Error (%s): Link %s without a base provided.\n"),
|
||||||
file, temp);
|
file, temp);
|
||||||
free (temp);
|
free (temp);
|
||||||
}
|
}
|
||||||
|
free (needs_freeing);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (this_url)
|
if (this_url)
|
||||||
@ -966,17 +982,18 @@ get_urls_html (const char *file, const char *this_url, int silent,
|
|||||||
logprintf (LOG_NOTQUIET, _("\
|
logprintf (LOG_NOTQUIET, _("\
|
||||||
Error (%s): Base %s relative, without referer URL.\n"),
|
Error (%s): Base %s relative, without referer URL.\n"),
|
||||||
file, cbase);
|
file, cbase);
|
||||||
|
free (needs_freeing);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
base = xstrdup (cbase);
|
base = xstrdup (cbase);
|
||||||
}
|
}
|
||||||
constr = construct (base, pbuf, size, no_proto);
|
constr = construct (base, url_data, size, no_proto);
|
||||||
free (base);
|
free (base);
|
||||||
}
|
}
|
||||||
else /* has proto */
|
else /* has proto */
|
||||||
{
|
{
|
||||||
constr = (char *)xmalloc (size + 1);
|
constr = (char *)xmalloc (size + 1);
|
||||||
strncpy (constr, pbuf, size);
|
strncpy (constr, url_data, size);
|
||||||
constr[size] = '\0';
|
constr[size] = '\0';
|
||||||
}
|
}
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
@ -988,7 +1005,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
|
|||||||
tmp2 = html_base ();
|
tmp2 = html_base ();
|
||||||
/* Use malloc, not alloca because this is called in a loop. */
|
/* Use malloc, not alloca because this is called in a loop. */
|
||||||
tmp = (char *)xmalloc (size + 1);
|
tmp = (char *)xmalloc (size + 1);
|
||||||
strncpy (tmp, pbuf, size);
|
strncpy (tmp, url_data, size);
|
||||||
tmp[size] = '\0';
|
tmp[size] = '\0';
|
||||||
logprintf (LOG_ALWAYS,
|
logprintf (LOG_ALWAYS,
|
||||||
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
|
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
|
||||||
@ -1009,14 +1026,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
|
|||||||
memset (current, 0, sizeof (*current));
|
memset (current, 0, sizeof (*current));
|
||||||
current->next = NULL;
|
current->next = NULL;
|
||||||
current->url = constr;
|
current->url = constr;
|
||||||
current->size = size;
|
current->size = step;
|
||||||
current->pos = pbuf - orig_buf;
|
current->pos = buf - orig_buf;
|
||||||
/* A URL is relative if the host and protocol are not named,
|
/* A URL is relative if the host and protocol are not named,
|
||||||
and the name does not start with `/'. */
|
and the name does not start with `/'. */
|
||||||
if (no_proto && *pbuf != '/')
|
if (no_proto && *url_data != '/')
|
||||||
current->flags |= (URELATIVE | UNOPROTO);
|
current->flags |= (URELATIVE | UNOPROTO);
|
||||||
else if (no_proto)
|
else if (no_proto)
|
||||||
current->flags |= UNOPROTO;
|
current->flags |= UNOPROTO;
|
||||||
|
free (needs_freeing);
|
||||||
}
|
}
|
||||||
free (orig_buf);
|
free (orig_buf);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user