mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Introduce non-strict comment parsing.
This commit is contained in:
parent
de19d38434
commit
7c802e58d3
13
NEWS
13
NEWS
@ -19,7 +19,7 @@ data.
|
|||||||
|
|
||||||
** Download speed shown by the progress bar is based on the data
|
** Download speed shown by the progress bar is based on the data
|
||||||
recently read, rather than the average speed of the entire download.
|
recently read, rather than the average speed of the entire download.
|
||||||
The ETA is still based on the average speed, though.
|
The ETA projection is still based on the overall average.
|
||||||
|
|
||||||
** It is now possible to connect to FTP servers through FWTK
|
** It is now possible to connect to FTP servers through FWTK
|
||||||
firewalls. Set ftp_proxy to an FTP URL, and Wget will automatically
|
firewalls. Set ftp_proxy to an FTP URL, and Wget will automatically
|
||||||
@ -39,6 +39,17 @@ characters such as space. You can use the new option
|
|||||||
--restrict-file-names to relax or strengthen these rules, which can be
|
--restrict-file-names to relax or strengthen these rules, which can be
|
||||||
useful if you dislike the default or if you're downloading to
|
useful if you dislike the default or if you're downloading to
|
||||||
non-native partitions.
|
non-native partitions.
|
||||||
|
|
||||||
|
** Handling of HTML comments has been dumbed down to conform to what
|
||||||
|
users expect and other browsers do: instead of being treated as SGML
|
||||||
|
declaration, a comment is terminated at the first occurrence of "-->".
|
||||||
|
Use `--strict-comments' to revert to the old behavior.
|
||||||
|
|
||||||
|
** Wget now correctly handles relative URIs that begin with "//", such
|
||||||
|
as "//img.foo.com/foo.jpg".
|
||||||
|
|
||||||
|
** Boolean options in `.wgetrc' and on the command line now accept
|
||||||
|
values "yes" and "no" along with the traditional "on" and "off".
|
||||||
|
|
||||||
* Wget 1.8.2 is a bugfix release with no user-visible changes.
|
* Wget 1.8.2 is a bugfix release with no user-visible changes.
|
||||||
|
|
||||||
|
@ -1365,6 +1365,40 @@ To finish off this topic, it's worth knowing that Wget's idea of an
|
|||||||
external document link is any URL specified in an @code{<A>} tag, an
|
external document link is any URL specified in an @code{<A>} tag, an
|
||||||
@code{<AREA>} tag, or a @code{<LINK>} tag other than @code{<LINK
|
@code{<AREA>} tag, or a @code{<LINK>} tag other than @code{<LINK
|
||||||
REL="stylesheet">}.
|
REL="stylesheet">}.
|
||||||
|
|
||||||
|
@cindex HTML comments
|
||||||
|
@cindex comments, HTML
|
||||||
|
@item --strict-comments
|
||||||
|
Turn on strict parsing of HTML comments. The default is to terminate
|
||||||
|
comments at the first occurrence of @samp{-->}.
|
||||||
|
|
||||||
|
According to specifications, HTML comments are expressed as SGML
|
||||||
|
@dfn{declarations}. A declaration is special markup that begins with
|
||||||
|
@samp{<!} and ends with @samp{>}, such as @samp{<!DOCTYPE ...>}, that
|
||||||
|
may contain comments between a pair of @samp{--} delimiters. HTML
|
||||||
|
comments are ``empty declarations'', SGML declarations without any
|
||||||
|
non-comment text. Therefore, @samp{<!--foo-->} is a valid comment, and
|
||||||
|
so is @samp{<!--one-- --two-->}, but @samp{<!--1--2-->} is not.
|
||||||
|
|
||||||
|
On the other hand, most HTML writers don't perceive comments as anything
|
||||||
|
other than text delimited with @samp{<!--} and @samp{-->}, which is not
|
||||||
|
quite the same. For example, something like @samp{<!------------>}
|
||||||
|
works as a valid comment as long as the number of dashes is a multiple
|
||||||
|
of four (!). If not, the comment technically lasts until the next
|
||||||
|
@samp{--}, which may be at the other end of the document. Because of
|
||||||
|
this, many popular browsers completely ignore the specification and
|
||||||
|
implement what users have come to expect: comments delimited with
|
||||||
|
@samp{<!--} and @samp{-->}.
|
||||||
|
|
||||||
|
Until version 1.9, Wget interpreted comments strictly, which resulted in
|
||||||
|
missing links in many web pages that displayed fine in browsers, but had
|
||||||
|
the misfortune of containing non-compliant comments. Beginning with
|
||||||
|
version 1.9, Wget has joined the ranks of clients that implement
|
||||||
|
``naive'' comments, terminating each comment at the first occurrence of
|
||||||
|
@samp{-->}.
|
||||||
|
|
||||||
|
If, for whatever reason, you want strict comment parsing, use this
|
||||||
|
option to turn it on.
|
||||||
@end table
|
@end table
|
||||||
|
|
||||||
@node Recursive Accept/Reject Options, , Recursive Retrieval Options, Invoking
|
@node Recursive Accept/Reject Options, , Recursive Retrieval Options, Invoking
|
||||||
@ -2306,6 +2340,9 @@ responses---the same as @samp{-S}.
|
|||||||
@item span_hosts = on/off
|
@item span_hosts = on/off
|
||||||
Same as @samp{-H}.
|
Same as @samp{-H}.
|
||||||
|
|
||||||
|
@item strict_comments = on/off
|
||||||
|
Same as @samp{--strict-comments}.
|
||||||
|
|
||||||
@item timeout = @var{n}
|
@item timeout = @var{n}
|
||||||
Set timeout value---the same as @samp{-T}.
|
Set timeout value---the same as @samp{-T}.
|
||||||
|
|
||||||
|
@ -1,3 +1,12 @@
|
|||||||
|
2003-09-19 Hrvoje Niksic <hniksic@xemacs.org>
|
||||||
|
|
||||||
|
* main.c (main): New option --strict-comments.
|
||||||
|
|
||||||
|
* html-parse.c (find_comment_end): New function: simple BM search
|
||||||
|
for "-->".
|
||||||
|
(map_html_tags): Use it if looking at a comment and not in strict
|
||||||
|
comments mode.
|
||||||
|
|
||||||
2003-09-17 Aurelien Marchand <artaxerxes@users.sf.net>
|
2003-09-17 Aurelien Marchand <artaxerxes@users.sf.net>
|
||||||
|
|
||||||
* ftp.h: Added OS400 system in enum
|
* ftp.h: Added OS400 system in enum
|
||||||
|
117
src/html-parse.c
117
src/html-parse.c
@ -1,5 +1,5 @@
|
|||||||
/* HTML parser for Wget.
|
/* HTML parser for Wget.
|
||||||
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
|
Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
|
||||||
|
|
||||||
This file is part of GNU Wget.
|
This file is part of GNU Wget.
|
||||||
|
|
||||||
@ -344,7 +344,7 @@ array_allowed (const char **array, const char *beg, const char *end)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Originally we used to adhere to RFC1866 here, and allowed only
|
/* Originally we used to adhere to RFC 1866 here, and allowed only
|
||||||
letters, digits, periods, and hyphens as names (of tags or
|
letters, digits, periods, and hyphens as names (of tags or
|
||||||
attributes). However, this broke too many pages which used
|
attributes). However, this broke too many pages which used
|
||||||
proprietary or strange attributes, e.g. <img src="a.gif"
|
proprietary or strange attributes, e.g. <img src="a.gif"
|
||||||
@ -362,29 +362,13 @@ array_allowed (const char **array, const char *beg, const char *end)
|
|||||||
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
|
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
|
||||||
&& (x) != '=' && (x) != '>' && (x) != '/')
|
&& (x) != '=' && (x) != '>' && (x) != '/')
|
||||||
|
|
||||||
/* States while advancing through comments. */
|
|
||||||
#define AC_S_DONE 0
|
|
||||||
#define AC_S_BACKOUT 1
|
|
||||||
#define AC_S_BANG 2
|
|
||||||
#define AC_S_DEFAULT 3
|
|
||||||
#define AC_S_DCLNAME 4
|
|
||||||
#define AC_S_DASH1 5
|
|
||||||
#define AC_S_DASH2 6
|
|
||||||
#define AC_S_COMMENT 7
|
|
||||||
#define AC_S_DASH3 8
|
|
||||||
#define AC_S_DASH4 9
|
|
||||||
#define AC_S_QUOTE1 10
|
|
||||||
#define AC_S_IN_QUOTE 11
|
|
||||||
#define AC_S_QUOTE2 12
|
|
||||||
|
|
||||||
#ifdef STANDALONE
|
#ifdef STANDALONE
|
||||||
static int comment_backout_count;
|
static int comment_backout_count;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Advance over an SGML declaration (the <!...> forms you find in HTML
|
/* Advance over an SGML declaration, such as <!DOCTYPE ...>. In
|
||||||
documents). The function returns the location after the
|
strict comments mode, this is used for skipping over comments as
|
||||||
declaration. The reason we need this is that HTML comments are
|
well.
|
||||||
expressed as comments in so-called "empty declarations".
|
|
||||||
|
|
||||||
To recap: any SGML declaration may have comments associated with
|
To recap: any SGML declaration may have comments associated with
|
||||||
it, e.g.
|
it, e.g.
|
||||||
@ -398,17 +382,31 @@ static int comment_backout_count;
|
|||||||
<!-- have -- -- fun -->
|
<!-- have -- -- fun -->
|
||||||
|
|
||||||
Whitespace is allowed between and after the comments, but not
|
Whitespace is allowed between and after the comments, but not
|
||||||
before the first comment.
|
before the first comment. Additionally, this function attempts to
|
||||||
|
handle double quotes in SGML declarations correctly. */
|
||||||
|
|
||||||
Additionally, this function attempts to handle double quotes in
|
|
||||||
SGML declarations correctly. */
|
|
||||||
static const char *
|
static const char *
|
||||||
advance_declaration (const char *beg, const char *end)
|
advance_declaration (const char *beg, const char *end)
|
||||||
{
|
{
|
||||||
const char *p = beg;
|
const char *p = beg;
|
||||||
char quote_char = '\0'; /* shut up, gcc! */
|
char quote_char = '\0'; /* shut up, gcc! */
|
||||||
char ch;
|
char ch;
|
||||||
int state = AC_S_BANG;
|
|
||||||
|
enum {
|
||||||
|
AC_S_DONE,
|
||||||
|
AC_S_BACKOUT,
|
||||||
|
AC_S_BANG,
|
||||||
|
AC_S_DEFAULT,
|
||||||
|
AC_S_DCLNAME,
|
||||||
|
AC_S_DASH1,
|
||||||
|
AC_S_DASH2,
|
||||||
|
AC_S_COMMENT,
|
||||||
|
AC_S_DASH3,
|
||||||
|
AC_S_DASH4,
|
||||||
|
AC_S_QUOTE1,
|
||||||
|
AC_S_IN_QUOTE,
|
||||||
|
AC_S_QUOTE2,
|
||||||
|
} state = AC_S_BANG;
|
||||||
|
|
||||||
if (beg == end)
|
if (beg == end)
|
||||||
return beg;
|
return beg;
|
||||||
@ -547,6 +545,55 @@ advance_declaration (const char *beg, const char *end)
|
|||||||
}
|
}
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Find the first occurrence of the substring "-->" in [BEG, END) and
|
||||||
|
return the pointer to the character after the substring. If the
|
||||||
|
substring is not found, return NULL. */
|
||||||
|
|
||||||
|
static const char *
|
||||||
|
find_comment_end (const char *beg, const char *end)
|
||||||
|
{
|
||||||
|
/* Open-coded Boyer-Moore search for "-->". Examine the third char;
|
||||||
|
if it's not '>' or '-', advance by three characters. Otherwise,
|
||||||
|
look at the preceding characters and try to find a match. */
|
||||||
|
|
||||||
|
const char *p = beg - 1;
|
||||||
|
|
||||||
|
while ((p += 3) < end)
|
||||||
|
switch (p[0])
|
||||||
|
{
|
||||||
|
case '>':
|
||||||
|
if (p[-1] == '-' && p[-2] == '-')
|
||||||
|
return p + 1;
|
||||||
|
break;
|
||||||
|
case '-':
|
||||||
|
at_dash:
|
||||||
|
if (p[-1] == '-')
|
||||||
|
{
|
||||||
|
at_dash_dash:
|
||||||
|
if (++p == end) return NULL;
|
||||||
|
switch (p[0])
|
||||||
|
{
|
||||||
|
case '>': return p + 1;
|
||||||
|
case '-': goto at_dash_dash;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ((p += 2) >= end) return NULL;
|
||||||
|
switch (p[0])
|
||||||
|
{
|
||||||
|
case '>':
|
||||||
|
if (p[-1] == '-')
|
||||||
|
return p + 1;
|
||||||
|
break;
|
||||||
|
case '-':
|
||||||
|
goto at_dash;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* Advance P (a char pointer), with the explicit intent of being able
|
/* Advance P (a char pointer), with the explicit intent of being able
|
||||||
to read the next character. If this is not possible, go to finish. */
|
to read the next character. If this is not possible, go to finish. */
|
||||||
@ -638,8 +685,26 @@ map_html_tags (const char *text, int size,
|
|||||||
declaration). */
|
declaration). */
|
||||||
if (*p == '!')
|
if (*p == '!')
|
||||||
{
|
{
|
||||||
/* This is an SGML declaration -- just skip it. */
|
if (!opt.strict_comments
|
||||||
|
&& p + 3 < end && p[1] == '-' && p[2] == '-')
|
||||||
|
{
|
||||||
|
/* If strict comments are not enforced and if we know
|
||||||
|
we're looking at a comment, simply look for the
|
||||||
|
terminating "-->". Non-strict is the default because
|
||||||
|
it works in other browsers and most HTML writers can't
|
||||||
|
be bothered with getting the comments right. */
|
||||||
|
const char *comment_end = find_comment_end (p + 3, end);
|
||||||
|
if (comment_end)
|
||||||
|
p = comment_end;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Either in strict comment mode or looking at a non-empty
|
||||||
|
declaration. Real declarations are much less likely to
|
||||||
|
be misused the way comments are, so advance over them
|
||||||
|
properly regardless of strictness. */
|
||||||
p = advance_declaration (p, end);
|
p = advance_declaration (p, end);
|
||||||
|
}
|
||||||
if (p == end)
|
if (p == end)
|
||||||
goto finish;
|
goto finish;
|
||||||
goto look_for_tag;
|
goto look_for_tag;
|
||||||
|
@ -207,6 +207,7 @@ static struct {
|
|||||||
{ "sslcheckcert", &opt.sslcheckcert, cmd_number },
|
{ "sslcheckcert", &opt.sslcheckcert, cmd_number },
|
||||||
{ "sslprotocol", &opt.sslprotocol, cmd_number },
|
{ "sslprotocol", &opt.sslprotocol, cmd_number },
|
||||||
#endif /* HAVE_SSL */
|
#endif /* HAVE_SSL */
|
||||||
|
{ "strictcomments", &opt.strict_comments, cmd_boolean },
|
||||||
{ "timeout", &opt.timeout, cmd_time },
|
{ "timeout", &opt.timeout, cmd_time },
|
||||||
{ "timestamping", &opt.timestamping, cmd_boolean },
|
{ "timestamping", &opt.timestamping, cmd_boolean },
|
||||||
{ "tries", &opt.ntry, cmd_number_inf },
|
{ "tries", &opt.ntry, cmd_number_inf },
|
||||||
|
@ -239,6 +239,7 @@ Recursive retrieval:\n\
|
|||||||
-K, --backup-converted before converting file X, back up as X.orig.\n\
|
-K, --backup-converted before converting file X, back up as X.orig.\n\
|
||||||
-m, --mirror shortcut option equivalent to -r -N -l inf -nr.\n\
|
-m, --mirror shortcut option equivalent to -r -N -l inf -nr.\n\
|
||||||
-p, --page-requisites get all images, etc. needed to display HTML page.\n\
|
-p, --page-requisites get all images, etc. needed to display HTML page.\n\
|
||||||
|
--strict-comments turn on strict (SGML) handling of HTML comments.\n\
|
||||||
\n"), stdout);
|
\n"), stdout);
|
||||||
fputs (_("\
|
fputs (_("\
|
||||||
Recursive accept/reject:\n\
|
Recursive accept/reject:\n\
|
||||||
@ -302,6 +303,7 @@ main (int argc, char *const *argv)
|
|||||||
{ "server-response", no_argument, NULL, 'S' },
|
{ "server-response", no_argument, NULL, 'S' },
|
||||||
{ "span-hosts", no_argument, NULL, 'H' },
|
{ "span-hosts", no_argument, NULL, 'H' },
|
||||||
{ "spider", no_argument, NULL, 132 },
|
{ "spider", no_argument, NULL, 132 },
|
||||||
|
{ "strict-comments", no_argument, NULL, 177 },
|
||||||
{ "timestamping", no_argument, NULL, 'N' },
|
{ "timestamping", no_argument, NULL, 'N' },
|
||||||
{ "verbose", no_argument, NULL, 'v' },
|
{ "verbose", no_argument, NULL, 'v' },
|
||||||
{ "version", no_argument, NULL, 'V' },
|
{ "version", no_argument, NULL, 'V' },
|
||||||
@ -524,6 +526,9 @@ GNU General Public License for more details.\n"));
|
|||||||
case 174:
|
case 174:
|
||||||
setval ("retryconnrefused", "on");
|
setval ("retryconnrefused", "on");
|
||||||
break;
|
break;
|
||||||
|
case 177:
|
||||||
|
setval ("strictcomments", "on");
|
||||||
|
break;
|
||||||
|
|
||||||
/* Options accepting an argument: */
|
/* Options accepting an argument: */
|
||||||
case 129:
|
case 129:
|
||||||
|
@ -192,6 +192,9 @@ struct options
|
|||||||
int restrict_files_ctrl; /* non-zero if control chars in URLs
|
int restrict_files_ctrl; /* non-zero if control chars in URLs
|
||||||
are restricted from appearing in
|
are restricted from appearing in
|
||||||
generated file names. */
|
generated file names. */
|
||||||
|
|
||||||
|
int strict_comments; /* whether strict SGML comments are
|
||||||
|
enforced. */
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct options opt;
|
extern struct options opt;
|
||||||
|
Loading…
Reference in New Issue
Block a user