
[svn] Ignore -np when in -p mode.

Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
hniksic 2001-11-30 13:17:53 -08:00
parent 5deec11234
commit a4db28e20f
6 changed files with 57 additions and 59 deletions
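In short, the patch drops the old dash_p_leaf_HTML pruning from the HTML parser and instead tags each collected link with link_inline_p when the page needs it to render; recur.c then uses that flag both to restrict -p leaf pages to their requisites and to exempt requisites from -np. Below is a small compilable model of the resulting policy, not code from the patch: descend_p and the struct link fields are made-up names standing in for descend_url_p and urlpos.

/* Illustrative model only; simplified names, not code from the patch.  */
struct link {
  int inline_p;         /* models urlpos.link_inline_p              */
  int same_site;        /* same scheme/host/port as the start URL   */
  int under_start_dir;  /* inside the start URL's directory subtree */
};

/* Decide whether recursion should follow LNK.  AT_P_LEAF models
   dash_p_leaf_HTML, i.e. an HTML page at the -l depth limit that was
   fetched only because of -p.  */
static int
descend_p (const struct link *lnk, int no_parent, int page_requisites,
           int at_p_leaf)
{
  if (at_p_leaf && !lnk->inline_p)
    return 0;                   /* leaf page: follow requisites only */

  if (no_parent && lnk->same_site && !lnk->under_start_dir
      && !(page_requisites && lnk->inline_p))
    return 0;                   /* -np blocks it, unless it is a -p requisite */

  return 1;
}

With -np -p, an image or stylesheet above the start directory therefore passes the second check, while an ordinary <a href> link up there is still rejected.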

TODO

@@ -17,12 +17,6 @@ changes.
 * -p should probably go "_two_ more hops" on <FRAMESET> pages.
-* Only normal link-following recursion should respect -np.  Page-requisite
-  recursion should not.  When -np -p is specified, Wget should still retrieve
-  requisite images and such on the server, even if they aren't in that directory
-  or a subdirectory of it.  Likewise, -H -np -p should retrieve requisite files
-  from other hosts.
 * Add a --range parameter allowing you to explicitly specify a range of bytes to
   get from a file over HTTP (FTP only supports ranges ending at the end of the
   file, though forcibly disconnecting from the server at the desired endpoint

src/ChangeLog

@@ -1,3 +1,16 @@
+2001-11-30  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* recur.c (retrieve_tree): Skip the non-inline entries when
+	enqueuing the children of a leaf HTML node in -p mode.
+	(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
+	"inline".
+
+	* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
+	(collect_tags_mapper): When an entry is "inline", mark it as such.
+
+	* recur.c (descend_url_p): Fix test when checking for
+	acceptance/rejection rules.
+
 2001-10-31  Daniel BODEA  <dali@dali-designs.com>
 
 	* netrc.c (search_netrc): When slack_default is 0, still look for

src/html-url.c

@@ -287,9 +287,6 @@ struct collect_urls_closure {
   struct urlpos *head, *tail;	/* List of URLs */
   const char *parent_base;	/* Base of the current document. */
   const char *document_file;	/* File name of this document. */
-  int dash_p_leaf_HTML;		/* Whether -p is specified, and this
-				   document is the "leaf" node of the
-				   HTML tree. */
   int nofollow;			/* whether NOFOLLOW was specified in a
 				   <meta name=robots> tag. */
 };
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 	for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
 	     i++)
 	  {
-	    char *attr_value;
-	    if (closure->dash_p_leaf_HTML
-		&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
-	      /* If we're at a -p leaf node, we don't want to retrieve
-		 links to references we know are external to this document,
-		 such as <a href=...>.  */
-	      continue;
-
-	    if (!strcasecmp (tag->attrs[id].name,
-			     url_tag_attr_map[i].attr_name))
+	    if (0 == strcasecmp (tag->attrs[id].name,
+				 url_tag_attr_map[i].attr_name))
 	      {
-		attr_value = tag->attrs[id].value;
+		char *attr_value = tag->attrs[id].value;
 		if (attr_value)
-		  handle_link (closure, attr_value, tag, id);
+		  {
+		    struct urlpos *entry;
+		    entry = handle_link (closure, attr_value, tag, id);
+		    if (entry != NULL
+			&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
+		      entry->link_inline_p = 1;
+		  }
 	      }
 	  }
 	}
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
     case TAG_LINK:
       {
 	int id;
-	char *rel = find_attr (tag, "rel", NULL);
 	char *href = find_attr (tag, "href", &id);
+
+	/* All <link href="..."> link references are external,
+	   except for <link rel="stylesheet" href="...">.  */
 	if (href)
 	  {
-	    /* In the normal case, all <link href=...> tags are
-	       fair game.
-
-	       In the special case of when -p is active, however,
-	       and we're at a leaf node (relative to the -l
-	       max. depth) in the HTML document tree, the only
-	       <LINK> tag we'll follow is a <LINK REL=
-	       "stylesheet">, as it'll be necessary for displaying
-	       this document properly.  We won't follow other
-	       <LINK> tags, like <LINK REL="home">, for instance,
-	       as they refer to external documents.  */
-	    if (!closure->dash_p_leaf_HTML
-		|| (rel && !strcasecmp (rel, "stylesheet")))
-	      handle_link (closure, href, tag, id);
+	    struct urlpos *entry;
+	    entry = handle_link (closure, href, tag, id);
+	    if (entry != NULL)
+	      {
+		char *rel = find_attr (tag, "rel", NULL);
+		if (rel && 0 == strcasecmp (rel, "stylesheet"))
+		  entry->link_inline_p = 1;
+	      }
 	  }
       }
       break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 
 /* Analyze HTML tags FILE and construct a list of URLs referenced from
    it.  It merges relative links in FILE with URL.  It is aware of
-   <base href=...> and does the right thing.
-
-   If dash_p_leaf_HTML is non-zero, only the elements needed to render
-   FILE ("non-external" links) will be returned.  */
+   <base href=...> and does the right thing.  */
 struct urlpos *
-get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
-	       int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 {
   struct file_memory *fm;
   struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
   closure.base = NULL;
   closure.parent_base = url ? url : opt.base_href;
   closure.document_file = file;
-  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
   closure.nofollow = 0;
 
   if (!interesting_tags)
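With dash_p_leaf_HTML gone, callers just pass the file and URL and filter on link_inline_p afterwards. A hedged sketch of such a caller follows; it assumes the urlpos list is chained through a next pointer as the head/tail fields above suggest, and count_requisites itself is invented for illustration. Only get_urls_html(), free_urlpos() and the urlpos flags come from this change.

/* Hypothetical caller of the new interface.  */
static int
count_requisites (const char *file, const char *url)
{
  int meta_disallow_follow = 0;
  int n = 0;
  struct urlpos *links, *l;

  links = get_urls_html (file, url, &meta_disallow_follow);
  for (l = links; l; l = l->next)
    if (l->link_inline_p && !l->ignore_when_downloading)
      ++n;                      /* images, stylesheets, and the like */
  free_urlpos (links);
  return n;
}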

src/recur.c

@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
       if (descend)
 	{
 	  int meta_disallow_follow = 0;
-	  struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
-						   &meta_disallow_follow);
+	  struct urlpos *children
+	    = get_urls_html (file, url, &meta_disallow_follow);
 
 	  if (opt.use_robots && meta_disallow_follow)
 	    {
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
 	    {
 	      if (child->ignore_when_downloading)
 		continue;
+	      if (dash_p_leaf_HTML && !child->link_inline_p)
+		continue;
 	      if (descend_url_p (child, url_parsed, depth, start_url_parsed,
 				 blacklist))
 		{
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   /* 4. Check for parent directory.
 
      If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for -p leaf retrievals.  */
+     opt.no_parent.  Also ignore it for documents needed to display
+     the parent page when in -p mode.  */
   if (opt.no_parent
       && u->scheme == start_url_parsed->scheme
       && 0 == strcasecmp (u->host, start_url_parsed->host)
-      && u->port == start_url_parsed->port)
+      && u->port == start_url_parsed->port
+      && !(opt.page_requisites && upos->link_inline_p))
     {
       if (!frontcmp (start_url_parsed->dir, u->dir))
 	{
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (u->file[0] != '\0'
       && ((suf = suffix (url)) == NULL
 	  || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-	  || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+	  || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
     {
       if (!acceptable (u->file))
 	{
@@ -674,7 +678,7 @@ convert_all_links (void)
 	DEBUGP (("I cannot find the corresponding URL.\n"));
 
       /* Parse the HTML file...  */
-      urls = get_urls_html (html->string, url, FALSE, NULL);
+      urls = get_urls_html (html->string, url, NULL);
 
       /* We don't respect meta_disallow_follow here because, even if
 	 the file is not followed, we might still want to convert the
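The one-character hunk above is the "fix test" item from the ChangeLog: the old condition compared opt.reclevel == INFINITE_RECURSION, so the depth clause could only fire when recursion was unlimited and never when a finite -l limit was in effect. Restated as a predicate under that reading (the helper name and factoring are mine, not the patch's; this assumes wget's opt and INFINITE_RECURSION plus <string.h>):

/* Apply the -A/-R acceptance rules unless the file looks like HTML
   that we still intend to descend into.  */
static int
apply_accept_reject_p (const char *suf, int depth)
{
  int looks_like_html =
    suf && (0 == strcmp (suf, "html") || 0 == strcmp (suf, "htm"));
  int at_depth_limit =
    opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel;

  return !looks_like_html || at_depth_limit;
}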

src/retr.c

@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
   uerr_t status;
   struct urlpos *url_list, *cur_url;
 
-  url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+  url_list = (html ? get_urls_html (file, NULL, NULL)
 	      : get_urls_file (file));
   status = RETROK;		/* Suppose everything is OK.  */
   *count = 0;			/* Reset the URL count.  */

src/url.h

@@ -79,16 +79,17 @@ struct urlpos {
   char *local_name;		/* local file to which it was saved
 				   (used by convert_links) */
 
-  int ignore_when_downloading;	/* reserved for special links such as
-				   <base href="..."> which are used
-				   when converting links, but ignored
-				   when downloading. */
+  /* reserved for special links such as <base href="..."> which are
+     used when converting links, but ignored when downloading.  */
+  unsigned int ignore_when_downloading :1;
 
   /* Information about the original link: */
 
-  int link_relative_p;		/* was the link relative? */
-  int link_complete_p;		/* was the link complete (with the
-				   host name, etc.) */
-  int link_base_p;		/* was the link <base href=...> */
+  unsigned int link_relative_p :1; /* was the link relative? */
+  unsigned int link_complete_p :1; /* was the link complete (with the
+				      host name, etc.) */
+  unsigned int link_base_p :1;	   /* was the link <base href=...> */
+  unsigned int link_inline_p :1;   /* needed to render the page. */
 
   /* Conversion requirements: */
   enum convert_options convert;	/* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
 char *url_string PARAMS ((const struct url *, int));
 
 struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
 void free_urlpos PARAMS ((struct urlpos *));
 
 char *uri_merge PARAMS ((const char *, const char *));
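The url.h hunk also turns the per-link booleans into one-bit bit-fields. A standalone illustration of the likely space motivation follows; this is not wget code, and the exact sizes depend on the compiler and ABI.

#include <stdio.h>

/* Four ints versus four one-bit bit-fields, as in the old and new
   struct urlpos flag members.  */
struct flags_as_ints { int a, b, c, d; };
struct flags_as_bits { unsigned int a:1, b:1, c:1, d:1; };

int
main (void)
{
  printf ("ints: %lu bytes, bit-fields: %lu bytes\n",
          (unsigned long) sizeof (struct flags_as_ints),
          (unsigned long) sizeof (struct flags_as_bits));
  /* Typically 16 versus 4; a single page can yield many urlpos
     entries, so the smaller layout adds up.  */
  return 0;
}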