From 1b2092fd06edfe1b752bb9e0308262d843419794 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 30 May 2010 14:01:10 +0200 Subject: [PATCH] Cope better with unclosed html tags. --- NEWS | 2 ++ src/ChangeLog | 6 ++++++ src/html-parse.c | 22 +++++++++++++--------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index 6faae62e..6f20a8b7 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,8 @@ Please send GNU Wget bug reports to . ** Set new cookies after an authorization failure. ** Exit with failure if -k is specified and -O is not a regular file. + +** Cope better with unclosed html tags. * Changes in Wget 1.12 diff --git a/src/ChangeLog b/src/ChangeLog index a84ff378..e5cc8cd1 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,9 @@ +2010-05-30 Giuseppe Scrivano + + * html-parse.c (NAME_CHAR_P): Consider '<' an invalid character. + (advance_declaration): Close the tag if '<' is found. + (map_html_tags): Likewise. + 2010-05-27 Giuseppe Scrivano * main.c (main): Exit with failure when -k is specified and -O is not diff --git a/src/html-parse.c b/src/html-parse.c index 070913f4..4cd86b93 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -528,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags * whitespace * 8-bit and control chars * characters that clearly cannot be part of name: - '=', '>', '/'. + '=', '<', '>', '/'. This only affects attribute and tag names; attribute values allow an even greater variety of characters. 
*/ #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ - && (x) != '=' && (x) != '>' && (x) != '/') + && (x) != '=' && (x) != '<' && (x) != '>' \ + && (x) != '/') #ifdef STANDALONE static int comment_backout_count; @@ -619,6 +620,7 @@ advance_declaration (const char *beg, const char *end) case '\n': ch = *p++; break; + case '<': case '>': state = AC_S_DONE; break; @@ -926,7 +928,7 @@ map_html_tags (const char *text, int size, } } - if (end_tag && *p != '>') + if (end_tag && *p != '>' && *p != '<') goto backout_tag; if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) @@ -958,12 +960,12 @@ map_html_tags (const char *text, int size, /* ^ */ ADVANCE (p); SKIP_WS (p); - if (*p != '>') + if (*p != '<' && *p != '>') goto backout_tag; } /* Check for end of tag definition. */ - if (*p == '>') + if (*p == '<' || *p == '>') break; /* Establish bounds of attribute name. */ @@ -978,7 +980,8 @@ map_html_tags (const char *text, int size, /* Establish bounds of attribute value. */ SKIP_WS (p); - if (NAME_CHAR_P (*p) || *p == '/' || *p == '>') + + if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>') { /* Minimized attribute syntax allows `=' to be omitted. For example,
    is a valid shorthand for
      ') + else if (newline_seen && (*p == '<' || *p == '>')) break; ADVANCE (p); } @@ -1040,7 +1043,7 @@ map_html_tags (const char *text, int size, violated by, for instance, `%' in `width=75%'. We'll be liberal and allow just about anything as an attribute value. */ - while (!c_isspace (*p) && *p != '>') + while (!c_isspace (*p) && *p != '<' && *p != '>') ADVANCE (p); attr_value_end = p; /* */ /* ^ */ @@ -1138,7 +1141,8 @@ map_html_tags (const char *text, int size, } mapfun (&taginfo, maparg); - ADVANCE (p); + if (*p != '<') + ADVANCE (p); } goto look_for_tag;