From ae1d264fcc190f9c74cb490aa6da0240b0b77b1e Mon Sep 17 00:00:00 2001 From: hniksic Date: Wed, 8 Oct 2003 09:17:33 -0700 Subject: [PATCH] [svn] Add FLAGS argument to map_html_tags. --- src/ChangeLog | 8 ++++++++ src/html-parse.c | 22 +++++++++------------- src/html-parse.h | 12 +++++++++--- src/html-url.c | 13 +++++++++++-- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index baa4c581..2e13ab9c 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2003-10-08 Hrvoje Niksic + + * html-url.c (get_urls_html): Pass the appropriate flags to + html-parse.c. + + * html-parse.c (map_html_tags): Accept FLAGS from the caller + instead of examining OPT. + 2003-10-08 Hrvoje Niksic * html-url.c (find_tag): Switch to binary search. diff --git a/src/html-parse.c b/src/html-parse.c index a5fab446..7eaf0c91 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -129,8 +129,6 @@ so, delete this exception statement from your version. */ # define ISALNUM(x) isalnum (x) # define TOLOWER(x) tolower (x) # define TOUPPER(x) toupper (x) - -static struct options opt; #endif /* STANDALONE */ /* Pool support. A pool is a resizable chunk of memory. It is first @@ -692,7 +690,7 @@ static int tag_backout_count; /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long. MAPFUN will be called with two arguments: pointer to an initialized - struct taginfo, and CLOSURE. + struct taginfo, and MAPARG. ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to be processed by this function.
If it is NULL, all the tags are @@ -708,10 +706,10 @@ static int tag_backout_count; void map_html_tags (const char *text, int size, + void (*mapfun) (struct taginfo *, void *), void *maparg, + int flags, const char **allowed_tag_names, - const char **allowed_attribute_names, - void (*mapfun) (struct taginfo *, void *), - void *closure) + const char **allowed_attribute_names) { /* storage for strings passed to MAPFUN callback; if 256 bytes is too little, POOL_APPEND allocates more with malloc. */ @@ -756,7 +754,7 @@ map_html_tags (const char *text, int size, declaration). */ if (*p == '!') { - if (!opt.strict_comments + if (!(flags & MHT_STRICT_COMMENTS) && p < end + 3 && p[1] == '-' && p[2] == '-') { /* If strict comments are not enforced and if we know @@ -893,11 +891,9 @@ map_html_tags (const char *text, int size, goto look_for_tag; attr_raw_value_end = p; /* <foo bar="baz"> */ /* ^ */ - /* The AP_TRIM_BLANKS is there for buggy HTML - generators that generate <a href=" foo"> instead of - <a href="foo"> (Netscape ignores spaces as well.) - If you really mean space, use &32; or %20. */ - operation = AP_PROCESS_ENTITIES | AP_TRIM_BLANKS; + operation = AP_PROCESS_ENTITIES; + if (flags & MHT_TRIM_VALUES) + operation |= AP_TRIM_BLANKS; } else { @@ -985,7 +981,7 @@ map_html_tags (const char *text, int size, taginfo.start_position = tag_start_position; taginfo.end_position = p + 1; /* Ta-dam! */ - (*mapfun) (&taginfo, closure); + (*mapfun) (&taginfo, maparg); ADVANCE (p); } goto look_for_tag; diff --git a/src/html-parse.h b/src/html-parse.h index 7d503bda..31dbf38f 100644 --- a/src/html-parse.h +++ b/src/html-parse.h @@ -6,7 +6,7 @@ This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. + (at your option) any later version.
GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -53,7 +53,13 @@ struct taginfo { const char *end_position; /* end position of tag */ }; -void map_html_tags PARAMS ((const char *, int, const char **, const char **, - void (*) (struct taginfo *, void *), void *)); +/* Flags for map_html_tags: */ +#define MHT_STRICT_COMMENTS 1 /* use strict comment interpretation */ +#define MHT_TRIM_VALUES 2 /* trim attribute values, e.g. interpret + <a href=" foo "> as "foo" */ + +void map_html_tags PARAMS ((const char *, int, + void (*) (struct taginfo *, void *), void *, + int, const char **, const char **)); #endif /* HTML_PARSE_H */ diff --git a/src/html-url.c b/src/html-url.c index abaa2f8e..09962edd 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -643,6 +643,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) { struct file_memory *fm; struct map_context ctx; + int flags; /* Load the file. */ fm = read_file (file); @@ -663,8 +664,16 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) if (!interesting_tags) init_interesting (); - map_html_tags (fm->content, fm->length, interesting_tags, - interesting_attributes, collect_tags_mapper, &ctx); + /* Specify MHT_TRIM_VALUES because of buggy HTML generators that + generate <a href=" foo"> instead of <a href="foo"> (Netscape + ignores spaces as well.) If you really mean space, use &#32; or + %20. */ + flags = MHT_TRIM_VALUES; + if (opt.strict_comments) + flags |= MHT_STRICT_COMMENTS; + + map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, + interesting_tags, interesting_attributes); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow)