mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Introduce non-strict comment parsing.
This commit is contained in:
parent
de19d38434
commit
7c802e58d3
13
NEWS
13
NEWS
@ -19,7 +19,7 @@ data.
|
||||
|
||||
** Download speed shown by the progress bar is based on the data
|
||||
recently read, rather than the average speed of the entire download.
|
||||
The ETA is still based on the average speed, though.
|
||||
The ETA projection is still based on the overall average.
|
||||
|
||||
** It is now possible to connect to FTP servers through FWTK
|
||||
firewalls. Set ftp_proxy to an FTP URL, and Wget will automatically
|
||||
@ -39,6 +39,17 @@ characters such as space. You can use the new option
|
||||
--restrict-file-names to relax or strengthen these rules, which can be
|
||||
useful if you dislike the default or if you're downloading to
|
||||
non-native partitions.
|
||||
|
||||
** Handling of HTML comments has been dumbed down to conform to what
|
||||
users expect and other browsers do: instead of being treated as an SGML
|
||||
declaration, a comment is terminated at the first occurrence of "-->".
|
||||
Use `--strict-comments' to revert to the old behavior.
|
||||
|
||||
** Wget now correctly handles relative URIs that begin with "//", such
|
||||
as "//img.foo.com/foo.jpg".
|
||||
|
||||
** Boolean options in `.wgetrc' and on the command line now accept
|
||||
values "yes" and "no" along with the traditional "on" and "off".
|
||||
|
||||
* Wget 1.8.2 is a bugfix release with no user-visible changes.
|
||||
|
||||
|
@ -1365,6 +1365,40 @@ To finish off this topic, it's worth knowing that Wget's idea of an
|
||||
external document link is any URL specified in an @code{<A>} tag, an
|
||||
@code{<AREA>} tag, or a @code{<LINK>} tag other than @code{<LINK
|
||||
REL="stylesheet">}.
|
||||
|
||||
@cindex HTML comments
|
||||
@cindex comments, HTML
|
||||
@item --strict-comments
|
||||
Turn on strict parsing of HTML comments. The default is to terminate
|
||||
comments at the first occurrence of @samp{-->}.
|
||||
|
||||
According to specifications, HTML comments are expressed as SGML
|
||||
@dfn{declarations}.  A declaration is special markup that begins with
|
||||
@samp{<!} and ends with @samp{>}, such as @samp{<!DOCTYPE ...>}, that
|
||||
may contain comments between a pair of @samp{--} delimiters. HTML
|
||||
comments are ``empty declarations'', SGML declarations without any
|
||||
non-comment text. Therefore, @samp{<!--foo-->} is a valid comment, and
|
||||
so is @samp{<!--one-- --two-->}, but @samp{<!--1--2-->} is not.
|
||||
|
||||
On the other hand, most HTML writers don't perceive comments as anything
|
||||
other than text delimited with @samp{<!--} and @samp{-->}, which is not
|
||||
quite the same. For example, something like @samp{<!------------>}
|
||||
works as a valid comment as long as the number of dashes is a multiple
|
||||
of four (!). If not, the comment technically lasts until the next
|
||||
@samp{--}, which may be at the other end of the document. Because of
|
||||
this, many popular browsers completely ignore the specification and
|
||||
implement what users have come to expect: comments delimited with
|
||||
@samp{<!--} and @samp{-->}.
|
||||
|
||||
Until version 1.9, Wget interpreted comments strictly, which resulted in
|
||||
missing links in many web pages that displayed fine in browsers, but had
|
||||
the misfortune of containing non-compliant comments. Beginning with
|
||||
version 1.9, Wget has joined the ranks of clients that implement
|
||||
``naive'' comments, terminating each comment at the first occurrence of
|
||||
@samp{-->}.
|
||||
|
||||
If, for whatever reason, you want strict comment parsing, use this
|
||||
option to turn it on.
|
||||
@end table
|
||||
|
||||
@node Recursive Accept/Reject Options, , Recursive Retrieval Options, Invoking
|
||||
@ -2306,6 +2340,9 @@ responses---the same as @samp{-S}.
|
||||
@item span_hosts = on/off
|
||||
Same as @samp{-H}.
|
||||
|
||||
@item strict_comments = on/off
|
||||
Same as @samp{--strict-comments}.
|
||||
|
||||
@item timeout = @var{n}
|
||||
Set timeout value---the same as @samp{-T}.
|
||||
|
||||
|
@ -1,3 +1,12 @@
|
||||
2003-09-19 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* main.c (main): New option --strict-comments.
|
||||
|
||||
* html-parse.c (find_comment_end): New function: simple BM search
|
||||
for "-->".
|
||||
(map_html_tags): Use it if looking at a comment and not in strict
|
||||
comments mode.
|
||||
|
||||
2003-09-17 Aurelien Marchand <artaxerxes@users.sf.net>
|
||||
|
||||
* ftp.h: Added OS400 system in enum
|
||||
|
117
src/html-parse.c
117
src/html-parse.c
@ -1,5 +1,5 @@
|
||||
/* HTML parser for Wget.
|
||||
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
|
||||
Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
@ -344,7 +344,7 @@ array_allowed (const char **array, const char *beg, const char *end)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Originally we used to adhere to RFC1866 here, and allowed only
|
||||
/* Originally we used to adhere to RFC 1866 here, and allowed only
|
||||
letters, digits, periods, and hyphens as names (of tags or
|
||||
attributes). However, this broke too many pages which used
|
||||
proprietary or strange attributes, e.g. <img src="a.gif"
|
||||
@ -362,29 +362,13 @@ array_allowed (const char **array, const char *beg, const char *end)
|
||||
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
|
||||
&& (x) != '=' && (x) != '>' && (x) != '/')
|
||||
|
||||
/* States while advancing through comments. */
|
||||
#define AC_S_DONE 0
|
||||
#define AC_S_BACKOUT 1
|
||||
#define AC_S_BANG 2
|
||||
#define AC_S_DEFAULT 3
|
||||
#define AC_S_DCLNAME 4
|
||||
#define AC_S_DASH1 5
|
||||
#define AC_S_DASH2 6
|
||||
#define AC_S_COMMENT 7
|
||||
#define AC_S_DASH3 8
|
||||
#define AC_S_DASH4 9
|
||||
#define AC_S_QUOTE1 10
|
||||
#define AC_S_IN_QUOTE 11
|
||||
#define AC_S_QUOTE2 12
|
||||
|
||||
#ifdef STANDALONE
|
||||
static int comment_backout_count;
|
||||
#endif
|
||||
|
||||
/* Advance over an SGML declaration (the <!...> forms you find in HTML
|
||||
documents). The function returns the location after the
|
||||
declaration. The reason we need this is that HTML comments are
|
||||
expressed as comments in so-called "empty declarations".
|
||||
/* Advance over an SGML declaration, such as <!DOCTYPE ...>. In
|
||||
strict comments mode, this is used for skipping over comments as
|
||||
well.
|
||||
|
||||
To recap: any SGML declaration may have comments associated with
|
||||
it, e.g.
|
||||
@ -398,17 +382,31 @@ static int comment_backout_count;
|
||||
<!-- have -- -- fun -->
|
||||
|
||||
Whitespace is allowed between and after the comments, but not
|
||||
before the first comment.
|
||||
before the first comment. Additionally, this function attempts to
|
||||
handle double quotes in SGML declarations correctly. */
|
||||
|
||||
Additionally, this function attempts to handle double quotes in
|
||||
SGML declarations correctly. */
|
||||
static const char *
|
||||
advance_declaration (const char *beg, const char *end)
|
||||
{
|
||||
const char *p = beg;
|
||||
char quote_char = '\0'; /* shut up, gcc! */
|
||||
char ch;
|
||||
int state = AC_S_BANG;
|
||||
|
||||
enum {
|
||||
AC_S_DONE,
|
||||
AC_S_BACKOUT,
|
||||
AC_S_BANG,
|
||||
AC_S_DEFAULT,
|
||||
AC_S_DCLNAME,
|
||||
AC_S_DASH1,
|
||||
AC_S_DASH2,
|
||||
AC_S_COMMENT,
|
||||
AC_S_DASH3,
|
||||
AC_S_DASH4,
|
||||
AC_S_QUOTE1,
|
||||
AC_S_IN_QUOTE,
|
||||
AC_S_QUOTE2,
|
||||
} state = AC_S_BANG;
|
||||
|
||||
if (beg == end)
|
||||
return beg;
|
||||
@ -547,6 +545,55 @@ advance_declaration (const char *beg, const char *end)
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Find the first occurrence of the substring "-->" in [BEG, END) and
   return the pointer to the character after the substring.  If the
   substring is not found, or if the region holds fewer than three
   characters, return NULL.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Boyer-Moore-style search for "-->".  P always points at the
     character that would have to be the closing '>' of a match ending
     at P.  Depending on what is actually found there, the shift to
     the next candidate position is:

       '>' without "--" before it: no match can end at P, P+1 or P+2,
                                   so advance by three;
       '-' preceded by '-':        a match could end at P+1, advance
                                   by one;
       '-' not preceded by '-':    a match could end at P+2, advance
                                   by two;
       anything else:              advance by three.

     The remaining length is checked before every advance so that P
     never points outside [BEG, END); forming such a pointer would be
     undefined behavior.  */
  const char *p;
  int skip;

  if (end - beg < 3)
    return NULL;

  p = beg + 2;
  for (;;)
    {
      switch (*p)
        {
        case '>':
          if (p[-1] == '-' && p[-2] == '-')
            return p + 1;
          skip = 3;
          break;
        case '-':
          skip = (p[-1] == '-') ? 1 : 2;
          break;
        default:
          skip = 3;
          break;
        }

      /* Stop if the next candidate position would be at or past END.  */
      if (end - p <= skip)
        return NULL;
      p += skip;
    }
}
|
||||
|
||||
/* Advance P (a char pointer), with the explicit intent of being able
|
||||
to read the next character. If this is not possible, go to finish. */
|
||||
@ -638,8 +685,26 @@ map_html_tags (const char *text, int size,
|
||||
declaration). */
|
||||
if (*p == '!')
|
||||
{
|
||||
/* This is an SGML declaration -- just skip it. */
|
||||
if (!opt.strict_comments
|
||||
&& p < end + 3 && p[1] == '-' && p[2] == '-')
|
||||
{
|
||||
/* If strict comments are not enforced and if we know
|
||||
we're looking at a comment, simply look for the
|
||||
terminating "-->". Non-strict is the default because
|
||||
it works in other browsers and most HTML writers can't
|
||||
be bothered with getting the comments right. */
|
||||
const char *comment_end = find_comment_end (p + 3, end);
|
||||
if (comment_end)
|
||||
p = comment_end;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Either in strict comment mode or looking at a non-empty
|
||||
declaration. Real declarations are much less likely to
|
||||
be misused the way comments are, so advance over them
|
||||
properly regardless of strictness. */
|
||||
p = advance_declaration (p, end);
|
||||
}
|
||||
if (p == end)
|
||||
goto finish;
|
||||
goto look_for_tag;
|
||||
|
@ -207,6 +207,7 @@ static struct {
|
||||
{ "sslcheckcert", &opt.sslcheckcert, cmd_number },
|
||||
{ "sslprotocol", &opt.sslprotocol, cmd_number },
|
||||
#endif /* HAVE_SSL */
|
||||
{ "strictcomments", &opt.strict_comments, cmd_boolean },
|
||||
{ "timeout", &opt.timeout, cmd_time },
|
||||
{ "timestamping", &opt.timestamping, cmd_boolean },
|
||||
{ "tries", &opt.ntry, cmd_number_inf },
|
||||
|
@ -239,6 +239,7 @@ Recursive retrieval:\n\
|
||||
-K, --backup-converted before converting file X, back up as X.orig.\n\
|
||||
-m, --mirror shortcut option equivalent to -r -N -l inf -nr.\n\
|
||||
-p, --page-requisites get all images, etc. needed to display HTML page.\n\
|
||||
--strict-comments turn on strict (SGML) handling of HTML comments.\n\
|
||||
\n"), stdout);
|
||||
fputs (_("\
|
||||
Recursive accept/reject:\n\
|
||||
@ -302,6 +303,7 @@ main (int argc, char *const *argv)
|
||||
{ "server-response", no_argument, NULL, 'S' },
|
||||
{ "span-hosts", no_argument, NULL, 'H' },
|
||||
{ "spider", no_argument, NULL, 132 },
|
||||
{ "strict-comments", no_argument, NULL, 177 },
|
||||
{ "timestamping", no_argument, NULL, 'N' },
|
||||
{ "verbose", no_argument, NULL, 'v' },
|
||||
{ "version", no_argument, NULL, 'V' },
|
||||
@ -524,6 +526,9 @@ GNU General Public License for more details.\n"));
|
||||
case 174:
|
||||
setval ("retryconnrefused", "on");
|
||||
break;
|
||||
case 177:
|
||||
setval ("strictcomments", "on");
|
||||
break;
|
||||
|
||||
/* Options accepting an argument: */
|
||||
case 129:
|
||||
|
@ -192,6 +192,9 @@ struct options
|
||||
int restrict_files_ctrl; /* non-zero if control chars in URLs
|
||||
are restricted from appearing in
|
||||
generated file names. */
|
||||
|
||||
int strict_comments; /* whether strict SGML comments are
|
||||
enforced. */
|
||||
};
|
||||
|
||||
extern struct options opt;
|
||||
|
Loading…
Reference in New Issue
Block a user