1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Add FLAGS argument to map_html_tags.

This commit is contained in:
hniksic 2003-10-08 09:17:33 -07:00
parent a9c3c58c9f
commit ae1d264fcc
4 changed files with 37 additions and 18 deletions

View File

@ -1,3 +1,11 @@
2003-10-08 Hrvoje Niksic <hniksic@xemacs.org>
* html-url.c (get_urls_html): Pass the appropriate flags to
html-parse.c.
* html-parse.c (map_html_tags): Accept FLAGS from the caller
instead of examining OPT.
2003-10-08 Hrvoje Niksic <hniksic@xemacs.org>
* html-url.c (find_tag): Switch to binary search.

View File

@ -129,8 +129,6 @@ so, delete this exception statement from your version. */
# define ISALNUM(x) isalnum (x)
# define TOLOWER(x) tolower (x)
# define TOUPPER(x) toupper (x)
static struct options opt;
#endif /* STANDALONE */
/* Pool support. A pool is a resizable chunk of memory. It is first
@ -692,7 +690,7 @@ static int tag_backout_count;
/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
MAPFUN will be called with two arguments: pointer to an initialized
struct taginfo, and CLOSURE.
struct taginfo, and MAPARG.
ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
be processed by this function. If it is NULL, all the tags are
@ -708,10 +706,10 @@ static int tag_backout_count;
void
map_html_tags (const char *text, int size,
void (*mapfun) (struct taginfo *, void *), void *maparg,
int flags,
const char **allowed_tag_names,
const char **allowed_attribute_names,
void (*mapfun) (struct taginfo *, void *),
void *closure)
const char **allowed_attribute_names)
{
/* storage for strings passed to MAPFUN callback; if 256 bytes is
too little, POOL_APPEND allocates more with malloc. */
@ -756,7 +754,7 @@ map_html_tags (const char *text, int size,
declaration). */
if (*p == '!')
{
if (!opt.strict_comments
if (!(flags & MHT_STRICT_COMMENTS)
&& p < end + 3 && p[1] == '-' && p[2] == '-')
{
/* If strict comments are not enforced and if we know
@ -893,11 +891,9 @@ map_html_tags (const char *text, int size,
goto look_for_tag;
attr_raw_value_end = p; /* <foo bar="baz"> */
/* ^ */
/* The AP_TRIM_BLANKS is there for buggy HTML
generators that generate <a href=" foo"> instead of
<a href="foo"> (Netscape ignores spaces as well.)
If you really mean space, use &32; or %20. */
operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS;
operation = AP_PROCESS_ENTITIES;
if (flags & MHT_TRIM_VALUES)
operation |= AP_TRIM_BLANKS;
}
else
{
@ -985,7 +981,7 @@ map_html_tags (const char *text, int size,
taginfo.start_position = tag_start_position;
taginfo.end_position = p + 1;
/* Ta-dam! */
(*mapfun) (&taginfo, closure);
(*mapfun) (&taginfo, maparg);
ADVANCE (p);
}
goto look_for_tag;

View File

@ -6,7 +6,7 @@ This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -53,7 +53,13 @@ struct taginfo {
const char *end_position; /* end position of tag */
};
void map_html_tags PARAMS ((const char *, int, const char **, const char **,
void (*) (struct taginfo *, void *), void *));
/* Flags for map_html_tags. Each flag is a distinct bit, so callers
combine them with bitwise OR and map_html_tags tests them with
bitwise AND: */
#define MHT_STRICT_COMMENTS 1 /* use strict comment interpretation */
#define MHT_TRIM_VALUES 2 /* trim attribute values, e.g. interpret
<a href=" foo "> as "foo" */
/* Map a callback over the HTML tags found in a text buffer.
Arguments, in order: the text and its size in characters; the
callback and the opaque argument that is handed back to it together
with each struct taginfo; the MHT_* flags above; the NULL-terminated
array of allowed tag names; and the array of allowed attribute
names. */
void map_html_tags PARAMS ((const char *, int,
void (*) (struct taginfo *, void *), void *,
int, const char **, const char **));
#endif /* HTML_PARSE_H */

View File

@ -643,6 +643,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{
struct file_memory *fm;
struct map_context ctx;
int flags;
/* Load the file. */
fm = read_file (file);
@ -663,8 +664,16 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
if (!interesting_tags)
init_interesting ();
map_html_tags (fm->content, fm->length, interesting_tags,
interesting_attributes, collect_tags_mapper, &ctx);
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
generate <a href=" foo"> instead of <a href="foo"> (Netscape
ignores spaces as well.) If you really mean space, use &#32; or
%20. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
interesting_tags, interesting_attributes);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)