From a4db28e20f709969144cf9b250ad006138ae6a44 Mon Sep 17 00:00:00 2001
From: hniksic
Date: Fri, 30 Nov 2001 13:17:53 -0800
Subject: [PATCH] [svn] Ignore -np when in -p mode.

Published in .
---
 TODO           |  6 -----
 src/ChangeLog  | 13 +++++++++++
 src/html-url.c | 60 +++++++++++++++++++-------------------------
 src/recur.c    | 16 +++++++++-----
 src/retr.c     |  2 +-
 src/url.h      | 19 ++++++++--------
 6 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/TODO b/TODO
index 4589bab9..84c796a3 100644
--- a/TODO
+++ b/TODO
@@ -17,12 +17,6 @@ changes.
 
 * -p should probably go "_two_ more hops" on <FRAMESET> pages.
 
-* Only normal link-following recursion should respect -np.  Page-requisite
-  recursion should not.  When -np -p is specified, Wget should still retrieve
-  requisite images and such on the server, even if they aren't in that directory
-  or a subdirectory of it.  Likewise, -H -np -p should retrieve requisite files
-  from other hosts.
-
 * Add a --range parameter allowing you to explicitly specify a range of bytes to
   get from a file over HTTP (FTP only supports ranges ending at the end of the
   file, though forcibly disconnecting from the server at the desired endpoint
diff --git a/src/ChangeLog b/src/ChangeLog
index ed710030..f480009a 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
+2001-11-30  Hrvoje Niksic
+
+	* recur.c (retrieve_tree): Skip the non-inline entries when
+	enqueuing the children of a leaf HTML node in -p mode.
+	(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
+	"inline".
+
+	* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
+	(collect_tags_mapper): When an entry is "inline", mark it as such.
+
+	* recur.c (descend_url_p): Fix test when checking for
+	acceptance/rejection rules.
+
 2001-10-31  Daniel BODEA
 
 	* netrc.c (search_netrc): When slack_default is 0, still look for
diff --git a/src/html-url.c b/src/html-url.c
index 051f5057..5942a49f 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -287,9 +287,6 @@ struct collect_urls_closure {
   struct urlpos *head, *tail;   /* List of URLs */
   const char *parent_base;      /* Base of the current document. */
   const char *document_file;    /* File name of this document. */
-  int dash_p_leaf_HTML;         /* Whether -p is specified, and this
-                                   document is the "leaf" node of the
-                                   HTML tree. */
   int nofollow;                 /* whether NOFOLLOW was specified
                                    in a <meta name=robots> tag. */
 };
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
         for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
              i++)
           {
-            char *attr_value;
-            if (closure->dash_p_leaf_HTML
-                && (url_tag_attr_map[i].flags & AF_EXTERNAL))
-              /* If we're at a -p leaf node, we don't want to retrieve
-                 links to references we know are external to this document,
-                 such as <a href=...>. */
-              continue;
-
-            if (!strcasecmp (tag->attrs[id].name,
-                             url_tag_attr_map[i].attr_name))
+            if (0 == strcasecmp (tag->attrs[id].name,
+                                 url_tag_attr_map[i].attr_name))
               {
-                attr_value = tag->attrs[id].value;
+                char *attr_value = tag->attrs[id].value;
                 if (attr_value)
-                  handle_link (closure, attr_value, tag, id);
+                  {
+                    struct urlpos *entry;
+                    entry = handle_link (closure, attr_value, tag, id);
+                    if (entry != NULL
+                        && !(url_tag_attr_map[i].flags & AF_EXTERNAL))
+                      entry->link_inline_p = 1;
+                  }
               }
           }
       }
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
     case TAG_LINK:
       {
         int id;
-        char *rel = find_attr (tag, "rel", NULL);
         char *href = find_attr (tag, "href", &id);
+
+        /* All link references are external,
+           except for <LINK REL="stylesheet">. */
         if (href)
           {
-            /* In the normal case, all <LINK> tags are fair game.
-
-               In the special case of when -p is active, however,
-               and we're at a leaf node (relative to the -l
-               max. depth) in the HTML document tree, the only
-               tag we'll follow is a <LINK REL="stylesheet">, as it'll be necessary for displaying
-               this document properly.  We won't follow other
-               <LINK> tags, like , for instance,
-               as they refer to external documents. */
-            if (!closure->dash_p_leaf_HTML
-                || (rel && !strcasecmp (rel, "stylesheet")))
-              handle_link (closure, href, tag, id);
+            struct urlpos *entry;
+            entry = handle_link (closure, href, tag, id);
+            if (entry != NULL)
+              {
+                char *rel = find_attr (tag, "rel", NULL);
+                if (rel && 0 == strcasecmp (rel, "stylesheet"))
+                  entry->link_inline_p = 1;
+              }
           }
       }
       break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 
 /* Analyze HTML tags FILE and construct a list of URLs referenced from
    it.  It merges relative links in FILE with URL.  It is aware of
-   <base href=...> and does the right thing.
-
-   If dash_p_leaf_HTML is non-zero, only the elements needed to render
-   FILE ("non-external" links) will be returned. */
+   <base href=...> and does the right thing. */
 struct urlpos *
-get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
-               int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 {
   struct file_memory *fm;
   struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
   closure.base = NULL;
   closure.parent_base = url ? url : opt.base_href;
   closure.document_file = file;
-  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
   closure.nofollow = 0;
 
   if (!interesting_tags)
diff --git a/src/recur.c b/src/recur.c
index 0aa96498..6b8c41b0 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
       if (descend)
         {
           int meta_disallow_follow = 0;
-          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
-                                                   &meta_disallow_follow);
+          struct urlpos *children
+            = get_urls_html (file, url, &meta_disallow_follow);
 
           if (opt.use_robots && meta_disallow_follow)
             {
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
                 {
                   if (child->ignore_when_downloading)
                     continue;
+                  if (dash_p_leaf_HTML && !child->link_inline_p)
+                    continue;
                   if (descend_url_p (child, url_parsed, depth,
                                      start_url_parsed, blacklist))
                     {
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   /* 4. Check for parent directory.
 
      If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for -p leaf retrievals. */
+     opt.no_parent.  Also ignore it for documents needed to display
+     the parent page when in -p mode. */
   if (opt.no_parent
       && u->scheme == start_url_parsed->scheme
       && 0 == strcasecmp (u->host, start_url_parsed->host)
-      && u->port == start_url_parsed->port)
+      && u->port == start_url_parsed->port
+      && !(opt.page_requisites && upos->link_inline_p))
     {
       if (!frontcmp (start_url_parsed->dir, u->dir))
         {
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (u->file[0] != '\0'
       && ((suf = suffix (url)) == NULL
           || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-          || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+          || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
     {
       if (!acceptable (u->file))
         {
@@ -674,7 +678,7 @@ convert_all_links (void)
           DEBUGP (("I cannot find the corresponding URL.\n"));
 
       /* Parse the HTML file... */
-      urls = get_urls_html (html->string, url, FALSE, NULL);
+      urls = get_urls_html (html->string, url, NULL);
 
       /* We don't respect meta_disallow_follow here because, even if
          the file is not followed, we might still want to convert the
diff --git a/src/retr.c b/src/retr.c
index 85822679..6c12462a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
   uerr_t status;
   struct urlpos *url_list, *cur_url;
 
-  url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+  url_list = (html ? get_urls_html (file, NULL, NULL)
               : get_urls_file (file));
   status = RETROK;              /* Suppose everything is OK.  */
   *count = 0;                   /* Reset the URL count.  */
diff --git a/src/url.h b/src/url.h
index 836cdad1..3c42c5aa 100644
--- a/src/url.h
+++ b/src/url.h
@@ -79,16 +79,17 @@ struct urlpos {
   char *local_name;             /* local file to which it was saved
                                    (used by convert_links) */
 
-  int ignore_when_downloading;  /* reserved for special links such as
-                                   <base href="..."> which are used
-                                   when converting links, but ignored
-                                   when downloading. */
+  /* reserved for special links such as <base href="..."> which are
+     used when converting links, but ignored when downloading. */
+  unsigned int ignore_when_downloading :1;
 
   /* Information about the original link: */
-  int link_relative_p;          /* was the link relative? */
-  int link_complete_p;          /* was the link complete (with the
-                                   host name, etc.) */
-  int link_base_p;              /* was the link <base href=...> */
+
+  unsigned int link_relative_p :1; /* was the link relative? */
+  unsigned int link_complete_p :1; /* was the link complete (with the
+                                      host name, etc.) */
+  unsigned int link_base_p :1;     /* was the link <base href=...> */
+  unsigned int link_inline_p :1;   /* needed to render the page. */
 
   /* Conversion requirements: */
   enum convert_options convert; /* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
 char *url_string PARAMS ((const struct url *, int));
 
 struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
 void free_urlpos PARAMS ((struct urlpos *));
 
 char *uri_merge PARAMS ((const char *, const char *));
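
Illustration (not part of the applied patch): the change moves the -p "leaf" filtering out of the HTML parser and into the recursion code. collect_tags_mapper now marks every harvested URL that is needed to render the referring page (non-AF_EXTERNAL attributes, plus <LINK REL="stylesheet">) with link_inline_p; retrieve_tree drops non-inline children at a -p leaf node; and descend_url_p waives the -np check for inline links while -p is active. The C sketch below reproduces just that gating logic in isolation. The options and link structures and the two predicate functions are simplified stand-ins written for this illustration, assuming link_inline_p has been set the way collect_tags_mapper does above; they are not Wget's real declarations.

#include <stdio.h>

/* Simplified stand-ins for Wget's option flags and the struct urlpos
   bits used by this patch; illustrative only, not Wget's own types. */
struct options {
  int page_requisites;             /* -p */
  int no_parent;                   /* -np */
};

struct link {
  const char *url;
  unsigned int link_inline_p :1;   /* needed to render the referring page */
};

/* Mirrors the retrieve_tree() gate added above: at a -p "leaf" HTML
   node, only inline links (page requisites) are enqueued at all. */
static int
enqueue_child_p (const struct link *child, int dash_p_leaf_HTML)
{
  if (dash_p_leaf_HTML && !child->link_inline_p)
    return 0;
  return 1;
}

/* Mirrors the descend_url_p() change: -np is waived for inline links
   when -p is in effect, so requisites outside the start directory are
   still fetched.  (The scheme/host/port comparisons are omitted here.) */
static int
rejected_by_no_parent_p (const struct options *opt, const struct link *child,
                         int under_start_directory)
{
  if (!opt->no_parent)
    return 0;                      /* -np not given */
  if (opt->page_requisites && child->link_inline_p)
    return 0;                      /* page requisite: ignore -np */
  return !under_start_directory;   /* normal -np behaviour */
}

int
main (void)
{
  struct options opt = { 1, 1 };   /* -p -np */
  struct link stylesheet = { "http://example.com/style/main.css", 1 };
  struct link ordinary   = { "http://example.com/other/page.html", 0 };

  /* At a -p leaf, only the requisite is enqueued. */
  printf ("enqueue stylesheet at leaf: %d\n",
          enqueue_child_p (&stylesheet, 1));
  printf ("enqueue ordinary link at leaf: %d\n",
          enqueue_child_p (&ordinary, 1));

  /* Both URLs lie outside the start directory; only the inline one
     survives -np. */
  printf ("-np rejects stylesheet: %d\n",
          rejected_by_no_parent_p (&opt, &stylesheet, 0));
  printf ("-np rejects ordinary link: %d\n",
          rejected_by_no_parent_p (&opt, &ordinary, 0));
  return 0;
}

Compiled and run, the sketch prints 1 for the stylesheet and 0 for the ordinary link in the enqueue case, and the reverse for the -np rejection case, matching the behaviour described in the ChangeLog entry.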