1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Break up collect_tags_mapper into several functions.

Published in <sxsg06gla1h.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-12-12 07:43:01 -08:00
parent 84e9851688
commit 8817f4c1a4
2 changed files with 273 additions and 245 deletions

View File

@ -1,3 +1,8 @@
2001-12-12 Hrvoje Niksic <hniksic@arsdigita.com>
* html-url.c (collect_tags_mapper): Break into several functions.
(tag_url_attributes): Collect <embed href=...>.
2001-12-11 Hrvoje Niksic <hniksic@arsdigita.com> 2001-12-11 Hrvoje Niksic <hniksic@arsdigita.com>
* host.c: New type ipv4_address. Use it consistently instead of * host.c: New type ipv4_address. Use it consistently instead of

View File

@ -38,79 +38,89 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
extern int errno; extern int errno;
#endif #endif
/* Declared here only so that it can appear in the handler prototypes
   below; the full definition comes later in the file.  */
struct map_context;

/* Type of a per-tag handler.  A handler receives the tag's index in
   known_tags, the parsed tag itself, and the mapping context.  */
typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
				       struct map_context *));

/* Emit a forward declaration for a static handler named FUN whose
   signature matches tag_handler_t.  */
#define DECLARE_TAG_HANDLER(fun)					\
  static void fun PARAMS ((int, struct taginfo *, struct map_context *))

/* The handlers are referenced from known_tags before they are
   defined, hence these forward declarations.  */
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
/* The list of known tags and functions used for handling them. Most
tags are simply harvested for URLs. */
static struct { static struct {
const char *name; const char *name;
enum tag_category category; tag_handler_t handler;
} known_tags[] = { } known_tags[] = {
#define TAG_A 0 #define TAG_A 0
{ "a", TC_LINK }, { "a", tag_find_urls },
#define TAG_APPLET 1 #define TAG_APPLET 1
{ "applet", TC_LINK }, { "applet", tag_find_urls },
#define TAG_AREA 2 #define TAG_AREA 2
{ "area", TC_LINK }, { "area", tag_find_urls },
#define TAG_BASE 3 #define TAG_BASE 3
{ "base", TC_SPEC }, { "base", tag_handle_base },
#define TAG_BGSOUND 4 #define TAG_BGSOUND 4
{ "bgsound", TC_LINK }, { "bgsound", tag_find_urls },
#define TAG_BODY 5 #define TAG_BODY 5
{ "body", TC_LINK }, { "body", tag_find_urls },
#define TAG_EMBED 6 #define TAG_EMBED 6
{ "embed", TC_LINK }, { "embed", tag_find_urls },
#define TAG_FIG 7 #define TAG_FIG 7
{ "fig", TC_LINK }, { "fig", tag_find_urls },
#define TAG_FRAME 8 #define TAG_FRAME 8
{ "frame", TC_LINK }, { "frame", tag_find_urls },
#define TAG_IFRAME 9 #define TAG_IFRAME 9
{ "iframe", TC_LINK }, { "iframe", tag_find_urls },
#define TAG_IMG 10 #define TAG_IMG 10
{ "img", TC_LINK }, { "img", tag_find_urls },
#define TAG_INPUT 11 #define TAG_INPUT 11
{ "input", TC_LINK }, { "input", tag_find_urls },
#define TAG_LAYER 12 #define TAG_LAYER 12
{ "layer", TC_LINK }, { "layer", tag_find_urls },
#define TAG_LINK 13 #define TAG_LINK 13
{ "link", TC_SPEC }, { "link", tag_handle_link },
#define TAG_META 14 #define TAG_META 14
{ "meta", TC_SPEC }, { "meta", tag_handle_meta },
#define TAG_OVERLAY 15 #define TAG_OVERLAY 15
{ "overlay", TC_LINK }, { "overlay", tag_find_urls },
#define TAG_SCRIPT 16 #define TAG_SCRIPT 16
{ "script", TC_LINK }, { "script", tag_find_urls },
#define TAG_TABLE 17 #define TAG_TABLE 17
{ "table", TC_LINK }, { "table", tag_find_urls },
#define TAG_TD 18 #define TAG_TD 18
{ "td", TC_LINK }, { "td", tag_find_urls },
#define TAG_TH 19 #define TAG_TH 19
{ "th", TC_LINK } { "th", tag_find_urls }
}; };
/* tag_url_attributes documents which attributes of which tags contain
URLs to harvest. It is used by tag_find_urls. */
/* Flags for specific url-attr pairs handled through TC_LINK: */ /* Defines for the FLAGS field; currently only one flag is defined. */
/* This tag points to an external document not necessary for rendering this /* This tag points to an external document not necessary for rendering this
document (i.e. it's not an inlined image, stylesheet, etc.). */ document (i.e. it's not an inlined image, stylesheet, etc.). */
#define AF_EXTERNAL 1 #define TUA_EXTERNAL 1
/* For tags handled by tag_find_urls: attributes that contain URLs to
/* For tags handled by TC_LINK: attributes that contain URLs to
download. */ download. */
static struct { static struct {
int tagid; int tagid;
const char *attr_name; const char *attr_name;
int flags; int flags;
} url_tag_attr_map[] = { } tag_url_attributes[] = {
{ TAG_A, "href", AF_EXTERNAL }, { TAG_A, "href", TUA_EXTERNAL },
{ TAG_APPLET, "code", 0 }, { TAG_APPLET, "code", 0 },
{ TAG_AREA, "href", AF_EXTERNAL }, { TAG_AREA, "href", TUA_EXTERNAL },
{ TAG_BGSOUND, "src", 0 }, { TAG_BGSOUND, "src", 0 },
{ TAG_BODY, "background", 0 }, { TAG_BODY, "background", 0 },
{ TAG_EMBED, "href", 0 },
{ TAG_EMBED, "src", 0 }, { TAG_EMBED, "src", 0 },
{ TAG_FIG, "src", 0 }, { TAG_FIG, "src", 0 },
{ TAG_FRAME, "src", 0 }, { TAG_FRAME, "src", 0 },
@ -150,7 +160,10 @@ init_interesting (void)
Here we also make sure that what we put in interesting_tags Here we also make sure that what we put in interesting_tags
matches the user's preferences as specified through --ignore-tags matches the user's preferences as specified through --ignore-tags
and --follow-tags. */ and --follow-tags.
This function is as large as this only because of the glorious
expressivity of the C programming language. */
{ {
int i, ind = 0; int i, ind = 0;
@ -209,7 +222,7 @@ init_interesting (void)
interesting_tags[ind] = NULL; interesting_tags[ind] = NULL;
} }
/* The same for attributes, except we loop through url_tag_attr_map. /* The same for attributes, except we loop through tag_url_attributes.
Here we also need to make sure that the list of attributes is Here we also need to make sure that the list of attributes is
unique, and to include the attributes from additional_attributes. */ unique, and to include the attributes from additional_attributes. */
{ {
@ -221,10 +234,10 @@ init_interesting (void)
att[i] = additional_attributes[i]; att[i] = additional_attributes[i];
ind = i; ind = i;
att[ind] = NULL; att[ind] = NULL;
for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++) for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
{ {
int j, seen = 0; int j, seen = 0;
const char *look_for = url_tag_attr_map[i].attr_name; const char *look_for = tag_url_attributes[i].attr_name;
for (j = 0; j < ind - 1; j++) for (j = 0; j < ind - 1; j++)
if (!strcmp (att[j], look_for)) if (!strcmp (att[j], look_for))
{ {
@ -264,49 +277,54 @@ find_tag (const char *tag_name)
} }
/* Find the value of attribute named NAME in the taginfo TAG. If the /* Find the value of attribute named NAME in the taginfo TAG. If the
attribute is not present, return NULL. If ATTRID is non-NULL, the attribute is not present, return NULL. If ATTRIND is non-NULL, the
exact identity of the attribute will be returned. */ index of the attribute in TAG will be stored there. */
static char * static char *
find_attr (struct taginfo *tag, const char *name, int *attrid) find_attr (struct taginfo *tag, const char *name, int *attrind)
{ {
int i; int i;
for (i = 0; i < tag->nattrs; i++) for (i = 0; i < tag->nattrs; i++)
if (!strcasecmp (tag->attrs[i].name, name)) if (!strcasecmp (tag->attrs[i].name, name))
{ {
if (attrid) if (attrind)
*attrid = i; *attrind = i;
return tag->attrs[i].value; return tag->attrs[i].value;
} }
return NULL; return NULL;
} }
/* State shared by the tag handlers while the tags of one HTML
   document are being mapped.  */
struct map_context {
  char *text;			/* The HTML text; URL positions are
				   recorded as offsets from here.  */
  char *base;			/* Base URI of the document, possibly
				   changed through <base href=...>.
				   NULL until such a tag is seen.  */
  const char *parent_base;	/* Base of the current document, used
				   when BASE is not set.  */
  const char *document_file;	/* File name of this document; used in
				   debugging messages.  */
  int nofollow;			/* Non-zero if NOFOLLOW was requested
				   by a <meta name=robots> tag.  */

  struct urlpos *head, *tail;	/* First and last element of the list
				   of URLs that is being built.  */
};
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID /* Append LINK_URI to the urlpos structure that is being built.
are the necessary context to store the position and size. */
LINK_URI will be merged with the current document base. TAG and
ATTRIND are the necessary context to store the position and
size. */
static struct urlpos * static struct urlpos *
handle_link (struct collect_urls_closure *closure, const char *link_uri, append_one_url (const char *link_uri, int inlinep,
struct taginfo *tag, int attrid) struct taginfo *tag, int attrind, struct map_context *ctx)
{ {
int link_has_scheme = url_has_scheme (link_uri); int link_has_scheme = url_has_scheme (link_uri);
struct urlpos *newel; struct urlpos *newel;
const char *base = closure->base ? closure->base : closure->parent_base; const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url; struct url *url;
if (!base) if (!base)
{ {
DEBUGP (("%s: no base, merge will use \"%s\".\n", DEBUGP (("%s: no base, merge will use \"%s\".\n",
closure->document_file, link_uri)); ctx->document_file, link_uri));
if (!link_has_scheme) if (!link_has_scheme)
{ {
@ -320,7 +338,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
if (!url) if (!url)
{ {
DEBUGP (("%s: link \"%s\" doesn't parse.\n", DEBUGP (("%s: link \"%s\" doesn't parse.\n",
closure->document_file, link_uri)); ctx->document_file, link_uri));
return NULL; return NULL;
} }
} }
@ -333,13 +351,13 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
char *complete_uri = uri_merge (base, link_uri); char *complete_uri = uri_merge (base, link_uri);
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
closure->document_file, base, link_uri, complete_uri)); ctx->document_file, base, link_uri, complete_uri));
url = url_parse (complete_uri, NULL); url = url_parse (complete_uri, NULL);
if (!url) if (!url)
{ {
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
closure->document_file, complete_uri)); ctx->document_file, complete_uri));
xfree (complete_uri); xfree (complete_uri);
return NULL; return NULL;
} }
@ -347,12 +365,13 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
} }
newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
memset (newel, 0, sizeof (*newel)); memset (newel, 0, sizeof (*newel));
newel->next = NULL; newel->next = NULL;
newel->url = url; newel->url = url;
newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
newel->size = tag->attrs[attrid].value_raw_size; newel->size = tag->attrs[attrind].value_raw_size;
newel->link_inline_p = inlinep;
/* A URL is relative if the host is not named, and the name does not /* A URL is relative if the host is not named, and the name does not
start with `/'. */ start with `/'. */
@ -361,135 +380,131 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
else if (link_has_scheme) else if (link_has_scheme)
newel->link_complete_p = 1; newel->link_complete_p = 1;
if (closure->tail) if (ctx->tail)
{ {
closure->tail->next = newel; ctx->tail->next = newel;
closure->tail = newel; ctx->tail = newel;
} }
else else
closure->tail = closure->head = newel; ctx->tail = ctx->head = newel;
return newel; return newel;
} }
/* All the tag_* functions are called from collect_tags_mapper, as
specified by KNOWN_TAGS. */
/* Examine name and attributes of TAG and take appropriate action. /* For most tags, all we want to do is harvest URLs from their
What will be done depends on TAG's category and attribute values. attributes. */
Tags of TC_LINK category have attributes that contain links to
follow; tags of TC_SPEC category need to be handled specially.
#### It would be nice to split this into several functions. */
static void static void
collect_tags_mapper (struct taginfo *tag, void *arg) tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{ {
struct collect_urls_closure *closure = (struct collect_urls_closure *)arg; int i, attrind, first = -1;
int tagid = find_tag (tag->name); int size = ARRAY_SIZE (tag_url_attributes);
assert (tagid != -1);
switch (known_tags[tagid].category)
{
case TC_LINK:
{
int i, id, first;
int size = ARRAY_SIZE (url_tag_attr_map);
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
if (url_tag_attr_map[i].tagid == tagid) if (tag_url_attributes[i].tagid == tagid)
break; {
/* We've found the index of url_tag_attr_map where the /* We've found the index of tag_url_attributes where the
attributes of our tags begin. Now, look for every one of attributes of our tags begin. */
them, and handle it. */
/* Need to process the attributes in the order they appear in
the tag, as this is required if we convert links. */
first = i; first = i;
for (id = 0; id < tag->nattrs; id++)
{
/* This nested loop may seem inefficient (O(n^2)), but it's
not, since the number of attributes (n) we loop over is
extremely small. In the worst case of IMG with all its
possible attributes, n^2 will be only 9. */
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
i++)
{
if (0 == strcasecmp (tag->attrs[id].name,
url_tag_attr_map[i].attr_name))
{
char *attr_value = tag->attrs[id].value;
if (attr_value)
{
struct urlpos *entry;
entry = handle_link (closure, attr_value, tag, id);
if (entry != NULL
&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
entry->link_inline_p = 1;
}
}
}
}
}
break;
case TC_SPEC:
switch (tagid)
{
case TAG_BASE:
{
struct urlpos *base_urlpos;
int id;
char *newbase = find_attr (tag, "href", &id);
if (!newbase)
break; break;
}
assert (first != -1);
base_urlpos = handle_link (closure, newbase, tag, id); /* Loop over the "interesting" attributes of this tag. In this
example, it will loop over "src" and "lowsrc".
<img src="foo.png" lowsrc="bar.png">
This has to be done in the outer loop so that the attributes are
processed in the same order in which they appear in the page.
This is required when converting links. */
for (attrind = 0; attrind < tag->nattrs; attrind++)
{
/* Find whether TAG/ATTRIND is a combination that contains a
URL. */
char *attrvalue = tag->attrs[attrind].value;
/* If you're cringing at the inefficiency of the nested loops,
remember that the number of attributes the inner loop
iterates over is laughably small -- three in the worst case
(IMG). */
for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
{
if (0 == strcasecmp (tag->attrs[attrind].name,
tag_url_attributes[i].attr_name))
{
int flags = tag_url_attributes[i].flags;
append_one_url (attrvalue, !(flags & TUA_EXTERNAL),
tag, attrind, ctx);
}
}
}
}
/* Handler for <base href=...>.  The HREF value is recorded in the URL
   list, marked as a base link that is not to be downloaded, and then
   becomes the document base kept in CTX->base, replacing any earlier
   one.  A tag without HREF, or whose HREF cannot be turned into a
   URL, leaves CTX->base unchanged.

   TAGID is not used; it is present only so that the signature matches
   tag_handler_t.  */

static void
tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *newbase = find_attr (tag, "href", &attrind);
  struct urlpos *base_urlpos;

  if (newbase == NULL)
    return;

  /* This call still sees the previous base, because CTX->base is not
     updated until further down.  */
  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
  if (base_urlpos == NULL)
    return;

  base_urlpos->ignore_when_downloading = 1;
  base_urlpos->link_base_p = 1;

  /* Drop the old base and install the new one, resolved against the
     parent document's base when there is one.  */
  if (ctx->base)
    xfree (ctx->base);
  ctx->base = (ctx->parent_base
	       ? uri_merge (ctx->parent_base, newbase)
	       : xstrdup (newbase));
}
/* Handler for <link href=...>.  The HREF value is appended to the URL
   list.  All <link> references are treated as external, except for
   <link rel="stylesheet" href="...">, which is recorded as inline.
   A tag without HREF is ignored.

   TAGID is not used; it is present only so that the signature matches
   tag_handler_t.  */

static void
tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *href = find_attr (tag, "href", &attrind);
  char *rel;
  int inlinep = 0;

  if (!href)
    return;

  rel = find_attr (tag, "rel", NULL);
  if (rel != NULL && strcasecmp (rel, "stylesheet") == 0)
    inlinep = 1;

  append_one_url (href, inlinep, tag, attrind, ctx);
}
break; /* Some pages use a META tag to specify that the page be refreshed by
case TAG_META: a new page after a given number of seconds. The general format for
/* Some pages use a META tag to specify that the page be this is:
refreshed by a new page after a given number of seconds.
The general format for this is:
<meta http-equiv=Refresh content="NUMBER; URL=index2.html"> <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
So we just need to skip past the "NUMBER; URL=" garbage So we just need to skip past the "NUMBER; URL=" garbage to get to
to get to the URL. */ the URL. */
{
static void
tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
{
char *name = find_attr (tag, "name", NULL); char *name = find_attr (tag, "name", NULL);
char *http_equiv = find_attr (tag, "http-equiv", NULL); char *http_equiv = find_attr (tag, "http-equiv", NULL);
if (http_equiv && !strcasecmp (http_equiv, "refresh"))
if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
{ {
struct urlpos *entry; struct urlpos *entry;
int id; int attrind;
char *p, *refresh = find_attr (tag, "content", &id); char *p, *refresh = find_attr (tag, "content", &attrind);
int timeout = 0; int timeout = 0;
for (p = refresh; ISDIGIT (*p); p++) for (p = refresh; ISDIGIT (*p); p++)
@ -499,7 +514,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
while (ISSPACE (*p)) while (ISSPACE (*p))
++p; ++p;
if (!(TOUPPER (*p) == 'U' if (!( TOUPPER (*p) == 'U'
&& TOUPPER (*(p + 1)) == 'R' && TOUPPER (*(p + 1)) == 'R'
&& TOUPPER (*(p + 2)) == 'L' && TOUPPER (*(p + 2)) == 'L'
&& *(p + 3) == '=')) && *(p + 3) == '='))
@ -508,14 +523,14 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
while (ISSPACE (*p)) while (ISSPACE (*p))
++p; ++p;
entry = handle_link (closure, p, tag, id); entry = append_one_url (p, 0, tag, attrind, ctx);
if (entry) if (entry)
{ {
entry->link_refresh_p = 1; entry->link_refresh_p = 1;
entry->refresh_timeout = timeout; entry->refresh_timeout = timeout;
} }
} }
else if (name && !strcasecmp (name, "robots")) else if (name && 0 == strcasecmp (name, "robots"))
{ {
/* Handle stuff like: /* Handle stuff like:
<meta name="robots" content="index,nofollow"> */ <meta name="robots" content="index,nofollow"> */
@ -523,7 +538,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
if (!content) if (!content)
return; return;
if (!strcasecmp (content, "none")) if (!strcasecmp (content, "none"))
closure->nofollow = 1; ctx->nofollow = 1;
else else
{ {
while (*content) while (*content)
@ -536,22 +551,30 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
else else
end = content + strlen (content); end = content + strlen (content);
if (!strncasecmp (content, "nofollow", end - content)) if (!strncasecmp (content, "nofollow", end - content))
closure->nofollow = 1; ctx->nofollow = 1;
content = end; content = end;
} }
} }
} }
}
break;
default:
/* Category is TC_SPEC, but tag name is unhandled. This
must not be. */
abort ();
}
break;
}
} }
/* Callback given to map_html_tags: dispatch TAG to the handler that
   known_tags registers for its name.  ARG is the struct map_context
   for the document being processed.

   TAG's name must be present in known_tags; this is checked with
   assert.  */

static void
collect_tags_mapper (struct taginfo *tag, void *arg)
{
  struct map_context *ctx = (struct map_context *) arg;
  int tagid = find_tag (tag->name);

  assert (tagid != -1);
  known_tags[tagid].handler (tagid, tag, ctx);
}
/* Analyze HTML tags FILE and construct a list of URLs referenced from /* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with URL. It is aware of it. It merges relative links in FILE with URL. It is aware of
<base href=...> and does the right thing. */ <base href=...> and does the right thing. */
@ -559,7 +582,7 @@ struct urlpos *
get_urls_html (const char *file, const char *url, int *meta_disallow_follow) get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{ {
struct file_memory *fm; struct file_memory *fm;
struct collect_urls_closure closure; struct map_context ctx;
/* Load the file. */ /* Load the file. */
fm = read_file (file); fm = read_file (file);
@ -570,26 +593,26 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
} }
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
closure.text = fm->content; ctx.text = fm->content;
closure.head = closure.tail = NULL; ctx.head = ctx.tail = NULL;
closure.base = NULL; ctx.base = NULL;
closure.parent_base = url ? url : opt.base_href; ctx.parent_base = url ? url : opt.base_href;
closure.document_file = file; ctx.document_file = file;
closure.nofollow = 0; ctx.nofollow = 0;
if (!interesting_tags) if (!interesting_tags)
init_interesting (); init_interesting ();
map_html_tags (fm->content, fm->length, interesting_tags, map_html_tags (fm->content, fm->length, interesting_tags,
interesting_attributes, collect_tags_mapper, &closure); interesting_attributes, collect_tags_mapper, &ctx);
DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow)); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow) if (meta_disallow_follow)
*meta_disallow_follow = closure.nofollow; *meta_disallow_follow = ctx.nofollow;
FREE_MAYBE (closure.base); FREE_MAYBE (ctx.base);
read_file_free (fm); read_file_free (fm);
return closure.head; return ctx.head;
} }
void void