diff --git a/TODO b/TODO
index 4589bab9..84c796a3 100644
--- a/TODO
+++ b/TODO
@@ -17,12 +17,6 @@ changes.
* -p should probably go "_two_ more hops" on <FRAMESET>
pages.

-* Only normal link-following recursion should respect -np. Page-requisite
- recursion should not. When -np -p is specified, Wget should still retrieve
- requisite images and such on the server, even if they aren't in that directory
- or a subdirectory of it. Likewise, -H -np -p should retrieve requisite files
- from other hosts.
-
* Add a --range parameter allowing you to explicitly specify a range of bytes to
get from a file over HTTP (FTP only supports ranges ending at the end of the
file, though forcibly disconnecting from the server at the desired endpoint
diff --git a/src/ChangeLog b/src/ChangeLog
index ed710030..f480009a 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
+2001-11-30 Hrvoje Niksic
+
+ * recur.c (retrieve_tree): Skip the non-inline entries when
+ enqueuing the children of a leaf HTML node in -p mode.
+ (descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
+ "inline".
+
+ * html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
+ (collect_tags_mapper): When an entry is "inline", mark it as such.
+
+ * recur.c (descend_url_p): Fix test when checking for
+ acceptance/rejection rules.
+
2001-10-31 Daniel BODEA

* netrc.c (search_netrc): When slack_default is 0, still look for
diff --git a/src/html-url.c b/src/html-url.c
index 051f5057..5942a49f 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -287,9 +287,6 @@ struct collect_urls_closure {
struct urlpos *head, *tail; /* List of URLs */
const char *parent_base; /* Base of the current document. */
const char *document_file; /* File name of this document. */
- int dash_p_leaf_HTML; /* Whether -p is specified, and this
- document is the "leaf" node of the
- HTML tree. */
int nofollow; /* whether NOFOLLOW was specified in a
<meta name=robots> tag. */
};
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
i++)
{
- char *attr_value;
- if (closure->dash_p_leaf_HTML
- && (url_tag_attr_map[i].flags & AF_EXTERNAL))
- /* If we're at a -p leaf node, we don't want to retrieve
- links to references we know are external to this document,
- such as <a href=...>. */
- continue;
-
- if (!strcasecmp (tag->attrs[id].name,
- url_tag_attr_map[i].attr_name))
+ if (0 == strcasecmp (tag->attrs[id].name,
+ url_tag_attr_map[i].attr_name))
{
- attr_value = tag->attrs[id].value;
+ char *attr_value = tag->attrs[id].value;
if (attr_value)
- handle_link (closure, attr_value, tag, id);
+ {
+ struct urlpos *entry;
+ entry = handle_link (closure, attr_value, tag, id);
+ if (entry != NULL
+ && !(url_tag_attr_map[i].flags & AF_EXTERNAL))
+ entry->link_inline_p = 1;
+ }
}
}
}
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
case TAG_LINK:
{
int id;
- char *rel = find_attr (tag, "rel", NULL);
char *href = find_attr (tag, "href", &id);
+
+ /* All <link href="..."> link references are external,
+ except for <link rel="stylesheet" href="...">. */
if (href)
{
- /* In the normal case, all <LINK> tags are
- fair game.
-
- In the special case of when -p is active, however,
- and we're at a leaf node (relative to the -l
- max. depth) in the HTML document tree, the only
- <LINK> tag we'll follow is a <LINK REL=
- "stylesheet">, as it'll be necessary for displaying
- this document properly. We won't follow other
- <LINK> tags, like <LINK REL="home">, for instance,
- as they refer to external documents. */
- if (!closure->dash_p_leaf_HTML
- || (rel && !strcasecmp (rel, "stylesheet")))
- handle_link (closure, href, tag, id);
+ struct urlpos *entry;
+ entry = handle_link (closure, href, tag, id);
+ if (entry != NULL)
+ {
+ char *rel = find_attr (tag, "rel", NULL);
+ if (rel && 0 == strcasecmp (rel, "stylesheet"))
+ entry->link_inline_p = 1;
+ }
}
}
break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)

/* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with URL. It is aware of
- <base href=...> and does the right thing.
-
- If dash_p_leaf_HTML is non-zero, only the elements needed to render
- FILE ("non-external" links) will be returned. */
+ <base href=...> and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
- int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{
struct file_memory *fm;
struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
closure.base = NULL;
closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
- closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;
if (!interesting_tags)
diff --git a/src/recur.c b/src/recur.c
index 0aa96498..6b8c41b0 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
if (descend)
{
int meta_disallow_follow = 0;
- struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
- &meta_disallow_follow);
+ struct urlpos *children
= get_urls_html (file, url, &meta_disallow_follow);

if (opt.use_robots && meta_disallow_follow)
{
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
{
if (child->ignore_when_downloading)
continue;
+ if (dash_p_leaf_HTML && !child->link_inline_p)
+ continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
/* 4. Check for parent directory.

If we descended to a different host or changed the scheme, ignore
- opt.no_parent. Also ignore it for -p leaf retrievals. */
+ opt.no_parent. Also ignore it for documents needed to display
+ the parent page when in -p mode. */
if (opt.no_parent
&& u->scheme == start_url_parsed->scheme
&& 0 == strcasecmp (u->host, start_url_parsed->host)
- && u->port == start_url_parsed->port)
+ && u->port == start_url_parsed->port
+ && !(opt.page_requisites && upos->link_inline_p))
{
if (!frontcmp (start_url_parsed->dir, u->dir))
{
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
- || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+ || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{
if (!acceptable (u->file))
{
@@ -674,7 +678,7 @@ convert_all_links (void)
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
- urls = get_urls_html (html->string, url, FALSE, NULL);
+ urls = get_urls_html (html->string, url, NULL);

/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
diff --git a/src/retr.c b/src/retr.c
index 85822679..6c12462a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
uerr_t status;
struct urlpos *url_list, *cur_url;

- url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+ url_list = (html ? get_urls_html (file, NULL, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
diff --git a/src/url.h b/src/url.h
index 836cdad1..3c42c5aa 100644
--- a/src/url.h
+++ b/src/url.h
@@ -79,16 +79,17 @@ struct urlpos {
char *local_name; /* local file to which it was saved
(used by convert_links) */

- int ignore_when_downloading; /* reserved for special links such as
- <base href="..."> which are used
- when converting links, but ignored
- when downloading. */
+ /* reserved for special links such as <base href="..."> which are
+ used when converting links, but ignored when downloading. */
+ unsigned int ignore_when_downloading :1;

/* Information about the original link: */
- int link_relative_p; /* was the link relative? */
- int link_complete_p; /* was the link complete (with the
- host name, etc.) */
- int link_base_p; /* was the link <base href=...> */
+
+ unsigned int link_relative_p :1; /* was the link relative? */
+ unsigned int link_complete_p :1; /* was the link complete (with the
+ host name, etc.) */
+ unsigned int link_base_p :1; /* was the link <base href=...> */
+ unsigned int link_inline_p :1; /* needed to render the page. */

/* Conversion requirements: */
enum convert_options convert; /* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
char *url_string PARAMS ((const struct url *, int));

struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
void free_urlpos PARAMS ((struct urlpos *));

char *uri_merge PARAMS ((const char *, const char *));
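
For readers following the API change above, here is a minimal, self-contained
sketch of the new filtering model. It is illustrative only and not part of the
patch: struct link and enqueue_children are stand-ins for wget's struct urlpos
and the enqueue loop in retrieve_tree.

#include <stdio.h>

/* Stand-in for struct urlpos: every discovered link now carries a
   link_inline_p bit saying whether it is needed to render the page
   (an image, a stylesheet, ...) rather than being a plain <a href>. */
struct link {
  const char *url;
  unsigned int link_inline_p :1;
  struct link *next;
};

/* Stand-in for the enqueue loop in retrieve_tree: at a -p leaf node,
   only the inline links (page requisites) are followed; ordinary
   links are skipped. */
static void
enqueue_children (const struct link *children, int dash_p_leaf_HTML)
{
  const struct link *cur;
  for (cur = children; cur; cur = cur->next)
    {
      if (dash_p_leaf_HTML && !cur->link_inline_p)
        continue;
      printf ("enqueue %s\n", cur->url);
    }
}

int
main (void)
{
  struct link css = { "style.css", 1, NULL };
  struct link img = { "logo.png", 1, &css };
  struct link a = { "next.html", 0, &img };

  /* At a leaf node, only logo.png and style.css are enqueued;
     next.html is skipped because it is not a page requisite. */
  enqueue_children (&a, 1);
  return 0;
}

The same bit drives the -np exemption in the patch: descend_url_p skips the
no-parent check when opt.page_requisites is set and the link is inline, so
requisites outside the start directory are still fetched.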