1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Cope better with unclosed html tags.

This commit is contained in:
Giuseppe Scrivano 2010-05-30 14:01:10 +02:00
parent 05fccaeed2
commit 1b2092fd06
3 changed files with 21 additions and 9 deletions

2
NEWS
View File

@ -19,6 +19,8 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
** Set new cookies after an authorization failure. ** Set new cookies after an authorization failure.
** Exit with failure if -k is specified and -O is not a regular file. ** Exit with failure if -k is specified and -O is not a regular file.
** Cope better with unclosed html tags.
* Changes in Wget 1.12 * Changes in Wget 1.12

View File

@ -1,3 +1,9 @@
2010-05-30 Giuseppe Scrivano <gscrivano@gnu.org>
* html-parse.c (NAME_CHAR_P): Consider '<' an invalid character.
(advance_declaration): Close the tag if '<' is found.
(map_html_tags): Likewise.
2010-05-27 Giuseppe Scrivano <gscrivano@gnu.org> 2010-05-27 Giuseppe Scrivano <gscrivano@gnu.org>
* main.c (main): Exit with failure when -k is specified and -O is not * main.c (main): Exit with failure when -k is specified and -O is not

View File

@ -528,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
* whitespace * whitespace
* 8-bit and control chars * 8-bit and control chars
* characters that clearly cannot be part of name: * characters that clearly cannot be part of name:
'=', '>', '/'. '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */ an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
&& (x) != '=' && (x) != '>' && (x) != '/') && (x) != '=' && (x) != '<' && (x) != '>' \
&& (x) != '/')
#ifdef STANDALONE #ifdef STANDALONE
static int comment_backout_count; static int comment_backout_count;
@ -619,6 +620,7 @@ advance_declaration (const char *beg, const char *end)
case '\n': case '\n':
ch = *p++; ch = *p++;
break; break;
case '<':
case '>': case '>':
state = AC_S_DONE; state = AC_S_DONE;
break; break;
@ -926,7 +928,7 @@ map_html_tags (const char *text, int size,
} }
} }
if (end_tag && *p != '>') if (end_tag && *p != '>' && *p != '<')
goto backout_tag; goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
@ -958,12 +960,12 @@ map_html_tags (const char *text, int size,
/* ^ */ /* ^ */
ADVANCE (p); ADVANCE (p);
SKIP_WS (p); SKIP_WS (p);
if (*p != '>') if (*p != '<' && *p != '>')
goto backout_tag; goto backout_tag;
} }
/* Check for end of tag definition. */ /* Check for end of tag definition. */
if (*p == '>') if (*p == '<' || *p == '>')
break; break;
/* Establish bounds of attribute name. */ /* Establish bounds of attribute name. */
@ -978,7 +980,8 @@ map_html_tags (const char *text, int size,
/* Establish bounds of attribute value. */ /* Establish bounds of attribute value. */
SKIP_WS (p); SKIP_WS (p);
if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{ {
/* Minimized attribute syntax allows `=' to be omitted. /* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL For example, <UL COMPACT> is a valid shorthand for <UL
@ -1015,7 +1018,7 @@ map_html_tags (const char *text, int size,
newline_seen = true; newline_seen = true;
continue; continue;
} }
else if (newline_seen && *p == '>') else if (newline_seen && (*p == '<' || *p == '>'))
break; break;
ADVANCE (p); ADVANCE (p);
} }
@ -1040,7 +1043,7 @@ map_html_tags (const char *text, int size,
violated by, for instance, `%' in `width=75%'. violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as We'll be liberal and allow just about anything as
an attribute value. */ an attribute value. */
while (!c_isspace (*p) && *p != '>') while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p); ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */ attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */ /* ^ */
@ -1138,7 +1141,8 @@ map_html_tags (const char *text, int size,
} }
mapfun (&taginfo, maparg); mapfun (&taginfo, maparg);
ADVANCE (p); if (*p != '<')
ADVANCE (p);
} }
goto look_for_tag; goto look_for_tag;