1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Added support for hexadecimal numeric entities.

This commit is contained in:
hniksic 2003-10-02 10:23:25 -07:00
parent 675c1a703c
commit 2e8899bc10
2 changed files with 78 additions and 36 deletions

View File

@ -1,3 +1,10 @@
2003-10-02 Hrvoje Niksic <hniksic@xemacs.org>
* html-parse.c (convert_and_copy): Handle numeric entities in
hexadecimal, &#xHH.
(convert_and_copy): Copy the contents directly to the pool without
a stack-allocated intermediary.
2003-10-02 Hrvoje Niksic <hniksic@xemacs.org> 2003-10-02 Hrvoje Niksic <hniksic@xemacs.org>
* utils.c (alarm_set): New function; use either setitimer or alarm * utils.c (alarm_set): New function; use either setitimer or alarm

View File

@ -46,10 +46,7 @@ so, delete this exception statement from your version. */
written some time during the Geturl 1.0 beta cycle, and was very written some time during the Geturl 1.0 beta cycle, and was very
inefficient and buggy. It also contained some very complex code to inefficient and buggy. It also contained some very complex code to
remember a list of parser states, because it was supposed to be remember a list of parser states, because it was supposed to be
reentrant. The idea was that several parsers would be running reentrant.
concurrently, and you'd have to pass the function a unique ID string
(for example, the URL) by which it found the relevant parser state
and returned the next URL. Over-engineering at its best.
The second HTML parser was written for Wget 1.4 (the first version The second HTML parser was written for Wget 1.4 (the first version
by the name `Wget'), and was a complete rewrite. Although the new by the name `Wget'), and was a complete rewrite. Although the new
@ -110,15 +107,30 @@ so, delete this exception statement from your version. */
#include "html-parse.h" #include "html-parse.h"
#ifdef STANDALONE #ifdef STANDALONE
# undef xmalloc
# undef xrealloc
# undef xfree
# define xmalloc malloc # define xmalloc malloc
# define xrealloc realloc # define xrealloc realloc
# define xfree free # define xfree free
# undef ISSPACE
# undef ISDIGIT
# undef ISXDIGIT
# undef ISALPHA
# undef ISALNUM
# undef TOLOWER
# undef TOUPPER
# define ISSPACE(x) isspace (x) # define ISSPACE(x) isspace (x)
# define ISDIGIT(x) isdigit (x) # define ISDIGIT(x) isdigit (x)
# define ISXDIGIT(x) isxdigit (x)
# define ISALPHA(x) isalpha (x) # define ISALPHA(x) isalpha (x)
# define ISALNUM(x) isalnum (x) # define ISALNUM(x) isalnum (x)
# define TOLOWER(x) tolower (x) # define TOLOWER(x) tolower (x)
# define TOUPPER(x) toupper (x)
static struct options opt;
#endif /* STANDALONE */ #endif /* STANDALONE */
/* Pool support. A pool is a resizable chunk of memory. It is first /* Pool support. A pool is a resizable chunk of memory. It is first
@ -171,22 +183,20 @@ struct pool {
is done. */ is done. */
#define POOL_APPEND(pool, beg, end) do { \ #define POOL_APPEND(pool, beg, end) do { \
const char *PA_beg = beg; \ const char *PA_beg = (beg); \
int PA_size = end - PA_beg; \ int PA_size = (end) - PA_beg; \
POOL_GROW (pool, PA_size); \ POOL_GROW (pool, PA_size); \
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \ memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
(pool).index += PA_size; \ (pool).index += PA_size; \
} while (0) } while (0)
/* The same as the above, but with zero termination. */ /* Append one character to the pool. Can be used to zero-terminate
pool strings. */
#define POOL_APPEND_ZT(pool, beg, end) do { \ #define POOL_APPEND_CHR(pool, ch) do { \
const char *PA_beg = beg; \ char PAC_char = (ch); \
int PA_size = end - PA_beg; \ POOL_GROW (pool, 1); \
POOL_GROW (pool, PA_size + 1); \ (pool).contents[(pool).index++] = PAC_char; \
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
(pool).contents[(pool).index + PA_size] = '\0'; \
(pool).index += PA_size + 1; \
} while (0) } while (0)
/* Forget old pool contents. The allocated memory is not freed. */ /* Forget old pool contents. The allocated memory is not freed. */
@ -210,11 +220,11 @@ struct pool {
#define AP_DOWNCASE 1 #define AP_DOWNCASE 1
#define AP_PROCESS_ENTITIES 2 #define AP_PROCESS_ENTITIES 2
#define AP_SKIP_BLANKS 4 #define AP_TRIM_BLANKS 4
/* Copy the text in the range [BEG, END) to POOL, optionally /* Copy the text in the range [BEG, END) to POOL, optionally
performing operations specified by FLAGS. FLAGS may be any performing operations specified by FLAGS. FLAGS may be any
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
with the following meaning: with the following meaning:
* AP_DOWNCASE -- downcase all the letters; * AP_DOWNCASE -- downcase all the letters;
@ -223,8 +233,9 @@ struct pool {
the decoded string. Recognized entities are &lt, &gt, &amp, &quot, the decoded string. Recognized entities are &lt, &gt, &amp, &quot,
&nbsp and the numerical entities. &nbsp and the numerical entities.
* AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
of text. */ of text. */
static void static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{ {
@ -234,7 +245,7 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
/* First, skip blanks if required. We must do this before entities /* First, skip blanks if required. We must do this before entities
are processed, so that blanks can still be inserted as, for are processed, so that blanks can still be inserted as, for
instance, `&#32;'. */ instance, `&#32;'. */
if (flags & AP_SKIP_BLANKS) if (flags & AP_TRIM_BLANKS)
{ {
while (beg < end && ISSPACE (*beg)) while (beg < end && ISSPACE (*beg))
++beg; ++beg;
@ -245,11 +256,16 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
if (flags & AP_PROCESS_ENTITIES) if (flags & AP_PROCESS_ENTITIES)
{ {
/* Stack-allocate a copy of text, process entities and copy it /* Grow the pool, then copy the text to the pool character by
to the pool. */ character, processing the encountered entities as we go
char *local_copy = (char *)alloca (size + 1); along.
It's safe (and necessary) to grow the pool in advance because
processing the entities can only *shorten* the string, it can
never lengthen it. */
POOL_GROW (*pool, end - beg);
const char *from = beg; const char *from = beg;
char *to = local_copy; char *to = pool->contents + pool->index;
while (from < end) while (from < end)
{ {
@ -260,22 +276,33 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
const char *save = from; const char *save = from;
int remain; int remain;
if (++from == end) goto lose; if (++from == end)
goto lose;
remain = end - from; remain = end - from;
/* Process numeric entities "&#DDD;" and "&#xHH;". */
if (*from == '#') if (*from == '#')
{ {
int numeric; int numeric = 0, digits = 0;
++from; ++from;
if (from == end || !ISDIGIT (*from)) goto lose; if (*from == 'x')
for (numeric = 0; from < end && ISDIGIT (*from); from++) {
numeric = 10 * numeric + (*from) - '0'; ++from;
if (from < end && ISALPHA (*from)) goto lose; for (; from < end && ISXDIGIT (*from); from++, digits++)
numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
}
else
{
for (; from < end && ISDIGIT (*from); from++, digits++)
numeric = (numeric * 10) + (*from - '0');
}
if (!digits)
goto lose;
numeric &= 0xff; numeric &= 0xff;
*to++ = numeric; *to++ = numeric;
} }
#define FROB(x) (remain >= (sizeof (x) - 1) \ #define FROB(x) (remain >= (sizeof (x) - 1) \
&& !memcmp (from, x, sizeof (x) - 1) \ && 0 == memcmp (from, x, sizeof (x) - 1) \
&& (*(from + sizeof (x) - 1) == ';' \ && (*(from + sizeof (x) - 1) == ';' \
|| remain == sizeof (x) - 1 \ || remain == sizeof (x) - 1 \
|| !ISALNUM (*(from + sizeof (x) - 1)))) || !ISALNUM (*(from + sizeof (x) - 1))))
@ -309,13 +336,20 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
*to++ = *from++; *to++ = *from++;
} }
} }
*to++ = '\0'; /* Verify that we haven't exceeded the original size. (It
POOL_APPEND (*pool, local_copy, to); shouldn't happen, hence the assert.) */
assert (to - (pool->contents + pool->index) <= end - beg);
/* Make POOL's tail point to the position following the string
we've written. */
pool->index = to - pool->contents;
POOL_APPEND_CHR (*pool, '\0');
} }
else else
{ {
/* Just copy the text to the pool. */ /* Just copy the text to the pool. */
POOL_APPEND_ZT (*pool, beg, end); POOL_APPEND (*pool, beg, end);
POOL_APPEND_CHR (*pool, '\0');
} }
if (flags & AP_DOWNCASE) if (flags & AP_DOWNCASE)
@ -822,10 +856,11 @@ map_html_tags (const char *text, int size,
goto look_for_tag; goto look_for_tag;
attr_raw_value_end = p; /* <foo bar="baz"> */ attr_raw_value_end = p; /* <foo bar="baz"> */
/* ^ */ /* ^ */
/* The AP_SKIP_BLANKS part is not entirely correct, /* The AP_TRIM_BLANKS is there for buggy HTML
because we don't want to skip blanks for all the generators that generate <a href=" foo"> instead of
attribute values. */ <a href="foo"> (Netscape ignores spaces as well.)
operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS; If you really mean space, use &#32; or %20. */
operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
} }
else else
{ {