mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Added support for hexadecimal numeric entities.
This commit is contained in:
parent
675c1a703c
commit
2e8899bc10
@ -1,3 +1,10 @@
|
|||||||
|
2003-10-02 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
|
* html-parse.c (convert_and_copy): Handle numeric entities in
|
||||||
|
hexadecimal, &#xHH.
|
||||||
|
(convert_and_copy): Copy the contents directly to the pool without
|
||||||
|
a stack-allocated intermediary.
|
||||||
|
|
||||||
2003-10-02 Hrvoje Niksic <hniksic@xemacs.org>
|
2003-10-02 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
* utils.c (alarm_set): New function; use either setitimer or alarm
|
* utils.c (alarm_set): New function; use either setitimer or alarm
|
||||||
|
107
src/html-parse.c
107
src/html-parse.c
@ -46,10 +46,7 @@ so, delete this exception statement from your version. */
|
|||||||
written some time during the Geturl 1.0 beta cycle, and was very
|
written some time during the Geturl 1.0 beta cycle, and was very
|
||||||
inefficient and buggy. It also contained some very complex code to
|
inefficient and buggy. It also contained some very complex code to
|
||||||
remember a list of parser states, because it was supposed to be
|
remember a list of parser states, because it was supposed to be
|
||||||
reentrant. The idea was that several parsers would be running
|
reentrant.
|
||||||
concurrently, and you'd have to pass the function a unique ID string
|
|
||||||
(for example, the URL) by which it found the relevant parser state
|
|
||||||
and returned the next URL. Over-engineering at its best.
|
|
||||||
|
|
||||||
The second HTML parser was written for Wget 1.4 (the first version
|
The second HTML parser was written for Wget 1.4 (the first version
|
||||||
by the name `Wget'), and was a complete rewrite. Although the new
|
by the name `Wget'), and was a complete rewrite. Although the new
|
||||||
@ -110,15 +107,30 @@ so, delete this exception statement from your version. */
|
|||||||
#include "html-parse.h"
|
#include "html-parse.h"
|
||||||
|
|
||||||
#ifdef STANDALONE
|
#ifdef STANDALONE
|
||||||
|
# undef xmalloc
|
||||||
|
# undef xrealloc
|
||||||
|
# undef xfree
|
||||||
# define xmalloc malloc
|
# define xmalloc malloc
|
||||||
# define xrealloc realloc
|
# define xrealloc realloc
|
||||||
# define xfree free
|
# define xfree free
|
||||||
|
|
||||||
|
# undef ISSPACE
|
||||||
|
# undef ISDIGIT
|
||||||
|
# undef ISXDIGIT
|
||||||
|
# undef ISALPHA
|
||||||
|
# undef ISALNUM
|
||||||
|
# undef TOLOWER
|
||||||
|
# undef TOUPPER
|
||||||
|
|
||||||
# define ISSPACE(x) isspace (x)
|
# define ISSPACE(x) isspace (x)
|
||||||
# define ISDIGIT(x) isdigit (x)
|
# define ISDIGIT(x) isdigit (x)
|
||||||
|
# define ISXDIGIT(x) isxdigit (x)
|
||||||
# define ISALPHA(x) isalpha (x)
|
# define ISALPHA(x) isalpha (x)
|
||||||
# define ISALNUM(x) isalnum (x)
|
# define ISALNUM(x) isalnum (x)
|
||||||
# define TOLOWER(x) tolower (x)
|
# define TOLOWER(x) tolower (x)
|
||||||
|
# define TOUPPER(x) toupper (x)
|
||||||
|
|
||||||
|
static struct options opt;
|
||||||
#endif /* STANDALONE */
|
#endif /* STANDALONE */
|
||||||
|
|
||||||
/* Pool support. A pool is a resizable chunk of memory. It is first
|
/* Pool support. A pool is a resizable chunk of memory. It is first
|
||||||
@ -171,22 +183,20 @@ struct pool {
|
|||||||
is done. */
|
is done. */
|
||||||
|
|
||||||
#define POOL_APPEND(pool, beg, end) do { \
|
#define POOL_APPEND(pool, beg, end) do { \
|
||||||
const char *PA_beg = beg; \
|
const char *PA_beg = (beg); \
|
||||||
int PA_size = end - PA_beg; \
|
int PA_size = (end) - PA_beg; \
|
||||||
POOL_GROW (pool, PA_size); \
|
POOL_GROW (pool, PA_size); \
|
||||||
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
|
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
|
||||||
(pool).index += PA_size; \
|
(pool).index += PA_size; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* The same as the above, but with zero termination. */
|
/* Append one character to the pool. Can be used to zero-terminate
|
||||||
|
pool strings. */
|
||||||
|
|
||||||
#define POOL_APPEND_ZT(pool, beg, end) do { \
|
#define POOL_APPEND_CHR(pool, ch) do { \
|
||||||
const char *PA_beg = beg; \
|
char PAC_char = (ch); \
|
||||||
int PA_size = end - PA_beg; \
|
POOL_GROW (pool, 1); \
|
||||||
POOL_GROW (pool, PA_size + 1); \
|
(pool).contents[(pool).index++] = PAC_char; \
|
||||||
memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
|
|
||||||
(pool).contents[(pool).index + PA_size] = '\0'; \
|
|
||||||
(pool).index += PA_size + 1; \
|
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* Forget old pool contents. The allocated memory is not freed. */
|
/* Forget old pool contents. The allocated memory is not freed. */
|
||||||
@ -210,11 +220,11 @@ struct pool {
|
|||||||
|
|
||||||
#define AP_DOWNCASE 1
|
#define AP_DOWNCASE 1
|
||||||
#define AP_PROCESS_ENTITIES 2
|
#define AP_PROCESS_ENTITIES 2
|
||||||
#define AP_SKIP_BLANKS 4
|
#define AP_TRIM_BLANKS 4
|
||||||
|
|
||||||
/* Copy the text in the range [BEG, END) to POOL, optionally
|
/* Copy the text in the range [BEG, END) to POOL, optionally
|
||||||
performing operations specified by FLAGS. FLAGS may be any
|
performing operations specified by FLAGS. FLAGS may be any
|
||||||
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
|
combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
|
||||||
with the following meaning:
|
with the following meaning:
|
||||||
|
|
||||||
* AP_DOWNCASE -- downcase all the letters;
|
* AP_DOWNCASE -- downcase all the letters;
|
||||||
@ -223,8 +233,9 @@ struct pool {
|
|||||||
the decoded string. Recognized entities are &lt;, &gt;, &amp;, &quot;,
|
the decoded string. Recognized entities are &lt;, &gt;, &amp;, &quot;,
|
||||||
&nbsp; and the numerical entities.
|
&nbsp; and the numerical entities.
|
||||||
|
|
||||||
* AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
|
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
|
||||||
of text. */
|
of text. */
|
||||||
|
|
||||||
static void
|
static void
|
||||||
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
|
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
|
||||||
{
|
{
|
||||||
@ -234,7 +245,7 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
|||||||
/* First, skip blanks if required. We must do this before entities
|
/* First, skip blanks if required. We must do this before entities
|
||||||
are processed, so that blanks can still be inserted as, for
|
are processed, so that blanks can still be inserted as, for
|
||||||
instance, `&nbsp;'. */
|
instance, `&nbsp;'. */
|
||||||
if (flags & AP_SKIP_BLANKS)
|
if (flags & AP_TRIM_BLANKS)
|
||||||
{
|
{
|
||||||
while (beg < end && ISSPACE (*beg))
|
while (beg < end && ISSPACE (*beg))
|
||||||
++beg;
|
++beg;
|
||||||
@ -245,11 +256,16 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
|||||||
|
|
||||||
if (flags & AP_PROCESS_ENTITIES)
|
if (flags & AP_PROCESS_ENTITIES)
|
||||||
{
|
{
|
||||||
/* Stack-allocate a copy of text, process entities and copy it
|
/* Grow the pool, then copy the text to the pool character by
|
||||||
to the pool. */
|
character, processing the encountered entities as we go
|
||||||
char *local_copy = (char *)alloca (size + 1);
|
along.
|
||||||
|
|
||||||
|
It's safe (and necessary) to grow the pool in advance because
|
||||||
|
processing the entities can only *shorten* the string, it can
|
||||||
|
never lengthen it. */
|
||||||
|
POOL_GROW (*pool, end - beg);
|
||||||
const char *from = beg;
|
const char *from = beg;
|
||||||
char *to = local_copy;
|
char *to = pool->contents + pool->index;
|
||||||
|
|
||||||
while (from < end)
|
while (from < end)
|
||||||
{
|
{
|
||||||
@ -260,22 +276,33 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
|||||||
const char *save = from;
|
const char *save = from;
|
||||||
int remain;
|
int remain;
|
||||||
|
|
||||||
if (++from == end) goto lose;
|
if (++from == end)
|
||||||
|
goto lose;
|
||||||
remain = end - from;
|
remain = end - from;
|
||||||
|
|
||||||
|
/* Process numeric entities "&#DDD;" and "&#xHH;". */
|
||||||
if (*from == '#')
|
if (*from == '#')
|
||||||
{
|
{
|
||||||
int numeric;
|
int numeric = 0, digits = 0;
|
||||||
++from;
|
++from;
|
||||||
if (from == end || !ISDIGIT (*from)) goto lose;
|
if (*from == 'x')
|
||||||
for (numeric = 0; from < end && ISDIGIT (*from); from++)
|
{
|
||||||
numeric = 10 * numeric + (*from) - '0';
|
++from;
|
||||||
if (from < end && ISALPHA (*from)) goto lose;
|
for (; from < end && ISXDIGIT (*from); from++, digits++)
|
||||||
|
numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (; from < end && ISDIGIT (*from); from++, digits++)
|
||||||
|
numeric = (numeric * 10) + (*from - '0');
|
||||||
|
}
|
||||||
|
if (!digits)
|
||||||
|
goto lose;
|
||||||
numeric &= 0xff;
|
numeric &= 0xff;
|
||||||
*to++ = numeric;
|
*to++ = numeric;
|
||||||
}
|
}
|
||||||
#define FROB(x) (remain >= (sizeof (x) - 1) \
|
#define FROB(x) (remain >= (sizeof (x) - 1) \
|
||||||
&& !memcmp (from, x, sizeof (x) - 1) \
|
&& 0 == memcmp (from, x, sizeof (x) - 1) \
|
||||||
&& (*(from + sizeof (x) - 1) == ';' \
|
&& (*(from + sizeof (x) - 1) == ';' \
|
||||||
|| remain == sizeof (x) - 1 \
|
|| remain == sizeof (x) - 1 \
|
||||||
|| !ISALNUM (*(from + sizeof (x) - 1))))
|
|| !ISALNUM (*(from + sizeof (x) - 1))))
|
||||||
@ -309,13 +336,20 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
|||||||
*to++ = *from++;
|
*to++ = *from++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*to++ = '\0';
|
/* Verify that we haven't exceeded the original size. (It
|
||||||
POOL_APPEND (*pool, local_copy, to);
|
shouldn't happen, hence the assert.) */
|
||||||
|
assert (to - (pool->contents + pool->index) <= end - beg);
|
||||||
|
|
||||||
|
/* Make POOL's tail point to the position following the string
|
||||||
|
we've written. */
|
||||||
|
pool->index = to - pool->contents;
|
||||||
|
POOL_APPEND_CHR (*pool, '\0');
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* Just copy the text to the pool. */
|
/* Just copy the text to the pool. */
|
||||||
POOL_APPEND_ZT (*pool, beg, end);
|
POOL_APPEND (*pool, beg, end);
|
||||||
|
POOL_APPEND_CHR (*pool, '\0');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & AP_DOWNCASE)
|
if (flags & AP_DOWNCASE)
|
||||||
@ -822,10 +856,11 @@ map_html_tags (const char *text, int size,
|
|||||||
goto look_for_tag;
|
goto look_for_tag;
|
||||||
attr_raw_value_end = p; /* <foo bar="baz"> */
|
attr_raw_value_end = p; /* <foo bar="baz"> */
|
||||||
/* ^ */
|
/* ^ */
|
||||||
/* The AP_SKIP_BLANKS part is not entirely correct,
|
/* The AP_TRIM_BLANKS is there for buggy HTML
|
||||||
because we don't want to skip blanks for all the
|
generators that generate <a href=" foo"> instead of
|
||||||
attribute values. */
|
<a href="foo"> (Netscape ignores spaces as well.)
|
||||||
operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
|
If you really mean space, use &#32; or %20. */
|
||||||
|
operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user