[svn] Ignore -np when in -p mode.

Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-11-30 13:17:53 -08:00
parent 5deec11234
commit a4db28e20f
6 changed files with 57 additions and 59 deletions

6
TODO
View File

@@ -17,12 +17,6 @@ changes.
* -p should probably go "_two_ more hops" on <FRAMESET> pages.
* Only normal link-following recursion should respect -np. Page-requisite
recursion should not. When -np -p is specified, Wget should still retrieve
requisite images and such on the server, even if they aren't in that directory
or a subdirectory of it. Likewise, -H -np -p should retrieve requisite files
from other hosts.
* Add a --range parameter allowing you to explicitly specify a range of bytes to
get from a file over HTTP (FTP only supports ranges ending at the end of the
file, though forcibly disconnecting from the server at the desired endpoint

View File

@@ -1,3 +1,16 @@
2001-11-30 Hrvoje Niksic <hniksic@arsdigita.com>
* recur.c (retrieve_tree): Skip the non-inline entries when
enqueuing the children of a leaf HTML node in -p mode.
(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
"inline".
* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
(collect_tags_mapper): When an entry is "inline", mark it as such.
* recur.c (descend_url_p): Fix test when checking for
acceptance/rejection rules.
2001-10-31 Daniel BODEA <dali@dali-designs.com>
* netrc.c (search_netrc): When slack_default is 0, still look for

View File

@@ -287,9 +287,6 @@ struct collect_urls_closure {
struct urlpos *head, *tail; /* List of URLs */
const char *parent_base; /* Base of the current document. */
const char *document_file; /* File name of this document. */
int dash_p_leaf_HTML; /* Whether -p is specified, and this
document is the "leaf" node of the
HTML tree. */
int nofollow; /* whether NOFOLLOW was specified in a
<meta name=robots> tag. */
};
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
i++)
{
char *attr_value;
if (closure->dash_p_leaf_HTML
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
/* If we're at a -p leaf node, we don't want to retrieve
links to references we know are external to this document,
such as <a href=...>. */
continue;
if (!strcasecmp (tag->attrs[id].name,
url_tag_attr_map[i].attr_name))
if (0 == strcasecmp (tag->attrs[id].name,
url_tag_attr_map[i].attr_name))
{
attr_value = tag->attrs[id].value;
char *attr_value = tag->attrs[id].value;
if (attr_value)
handle_link (closure, attr_value, tag, id);
{
struct urlpos *entry;
entry = handle_link (closure, attr_value, tag, id);
if (entry != NULL
&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
entry->link_inline_p = 1;
}
}
}
}
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
case TAG_LINK:
{
int id;
char *rel = find_attr (tag, "rel", NULL);
char *href = find_attr (tag, "href", &id);
/* All <link href="..."> link references are external,
except for <link rel="stylesheet" href="...">. */
if (href)
{
/* In the normal case, all <link href=...> tags are
fair game.
In the special case of when -p is active, however,
and we're at a leaf node (relative to the -l
max. depth) in the HTML document tree, the only
<LINK> tag we'll follow is a <LINK REL=
"stylesheet">, as it'll be necessary for displaying
this document properly. We won't follow other
<LINK> tags, like <LINK REL="home">, for instance,
as they refer to external documents. */
if (!closure->dash_p_leaf_HTML
|| (rel && !strcasecmp (rel, "stylesheet")))
handle_link (closure, href, tag, id);
struct urlpos *entry;
entry = handle_link (closure, href, tag, id);
if (entry != NULL)
{
char *rel = find_attr (tag, "rel", NULL);
if (rel && 0 == strcasecmp (rel, "stylesheet"))
entry->link_inline_p = 1;
}
}
}
break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
/* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with URL. It is aware of
<base href=...> and does the right thing.
If dash_p_leaf_HTML is non-zero, only the elements needed to render
FILE ("non-external" links) will be returned. */
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
int *meta_disallow_follow)
get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{
struct file_memory *fm;
struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
closure.base = NULL;
closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;
if (!interesting_tags)

View File

@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
if (descend)
{
int meta_disallow_follow = 0;
struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
&meta_disallow_follow);
struct urlpos *children
= get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
{
if (child->ignore_when_downloading)
continue;
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
/* 4. Check for parent directory.
If we descended to a different host or changed the scheme, ignore
opt.no_parent. Also ignore it for -p leaf retrievals. */
opt.no_parent. Also ignore it for documents needed to display
the parent page when in -p mode. */
if (opt.no_parent
&& u->scheme == start_url_parsed->scheme
&& 0 == strcasecmp (u->host, start_url_parsed->host)
&& u->port == start_url_parsed->port)
&& u->port == start_url_parsed->port
&& !(opt.page_requisites && upos->link_inline_p))
{
if (!frontcmp (start_url_parsed->dir, u->dir))
{
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|| (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{
if (!acceptable (u->file))
{
@@ -674,7 +678,7 @@ convert_all_links (void)
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
urls = get_urls_html (html->string, url, FALSE, NULL);
urls = get_urls_html (html->string, url, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
uerr_t status;
struct urlpos *url_list, *cur_url;
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
url_list = (html ? get_urls_html (file, NULL, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */

View File

@@ -79,16 +79,17 @@ struct urlpos {
char *local_name; /* local file to which it was saved
(used by convert_links) */
int ignore_when_downloading; /* reserved for special links such as
<base href="..."> which are used
when converting links, but ignored
when downloading. */
/* reserved for special links such as <base href="..."> which are
used when converting links, but ignored when downloading. */
unsigned int ignore_when_downloading :1;
/* Information about the original link: */
int link_relative_p; /* was the link relative? */
int link_complete_p; /* was the link complete (with the
host name, etc.) */
int link_base_p; /* was the link <base href=...> */
unsigned int link_relative_p :1; /* was the link relative? */
unsigned int link_complete_p :1; /* was the link complete (with the
host name, etc.) */
unsigned int link_base_p :1; /* was the link <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page. */
/* Conversion requirements: */
enum convert_options convert; /* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
char *url_string PARAMS ((const struct url *, int));
struct urlpos *get_urls_file PARAMS ((const char *));
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
void free_urlpos PARAMS ((struct urlpos *));
char *uri_merge PARAMS ((const char *, const char *));