mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Ignore -np when in -p mode.
Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
This commit is contained in:
parent
5deec11234
commit
a4db28e20f
6
TODO
6
TODO
@@ -17,12 +17,6 @@ changes.
|
||||
|
||||
* -p should probably go "_two_ more hops" on <FRAMESET> pages.
|
||||
|
||||
* Only normal link-following recursion should respect -np. Page-requisite
|
||||
recursion should not. When -np -p is specified, Wget should still retrieve
|
||||
requisite images and such on the server, even if they aren't in that directory
|
||||
or a subdirectory of it. Likewise, -H -np -p should retrieve requisite files
|
||||
from other hosts.
|
||||
|
||||
* Add a --range parameter allowing you to explicitly specify a range of bytes to
|
||||
get from a file over HTTP (FTP only supports ranges ending at the end of the
|
||||
file, though forcibly disconnecting from the server at the desired endpoint
|
||||
|
@@ -1,3 +1,16 @@
|
||||
2001-11-30 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* recur.c (retrieve_tree): Skip the non-inline entries when
|
||||
enqueuing the children of a leaf HTML node in -p mode.
|
||||
(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
|
||||
"inline".
|
||||
|
||||
* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
|
||||
(collect_tags_mapper): When an entry is "inline", mark it as such.
|
||||
|
||||
* recur.c (descend_url_p): Fix test when checking for
|
||||
acceptance/rejection rules.
|
||||
|
||||
2001-10-31 Daniel BODEA <dali@dali-designs.com>
|
||||
|
||||
* netrc.c (search_netrc): When slack_default is 0, still look for
|
||||
|
@@ -287,9 +287,6 @@ struct collect_urls_closure {
|
||||
struct urlpos *head, *tail; /* List of URLs */
|
||||
const char *parent_base; /* Base of the current document. */
|
||||
const char *document_file; /* File name of this document. */
|
||||
int dash_p_leaf_HTML; /* Whether -p is specified, and this
|
||||
document is the "leaf" node of the
|
||||
HTML tree. */
|
||||
int nofollow; /* whether NOFOLLOW was specified in a
|
||||
<meta name=robots> tag. */
|
||||
};
|
||||
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
|
||||
i++)
|
||||
{
|
||||
char *attr_value;
|
||||
if (closure->dash_p_leaf_HTML
|
||||
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
|
||||
/* If we're at a -p leaf node, we don't want to retrieve
|
||||
links to references we know are external to this document,
|
||||
such as <a href=...>. */
|
||||
continue;
|
||||
|
||||
if (!strcasecmp (tag->attrs[id].name,
|
||||
if (0 == strcasecmp (tag->attrs[id].name,
|
||||
url_tag_attr_map[i].attr_name))
|
||||
{
|
||||
attr_value = tag->attrs[id].value;
|
||||
char *attr_value = tag->attrs[id].value;
|
||||
if (attr_value)
|
||||
handle_link (closure, attr_value, tag, id);
|
||||
{
|
||||
struct urlpos *entry;
|
||||
entry = handle_link (closure, attr_value, tag, id);
|
||||
if (entry != NULL
|
||||
&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
|
||||
entry->link_inline_p = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
case TAG_LINK:
|
||||
{
|
||||
int id;
|
||||
char *rel = find_attr (tag, "rel", NULL);
|
||||
char *href = find_attr (tag, "href", &id);
|
||||
|
||||
/* All <link href="..."> link references are external,
|
||||
except for <link rel="stylesheet" href="...">. */
|
||||
if (href)
|
||||
{
|
||||
/* In the normal case, all <link href=...> tags are
|
||||
fair game.
|
||||
|
||||
In the special case of when -p is active, however,
|
||||
and we're at a leaf node (relative to the -l
|
||||
max. depth) in the HTML document tree, the only
|
||||
<LINK> tag we'll follow is a <LINK REL=
|
||||
"stylesheet">, as it'll be necessary for displaying
|
||||
this document properly. We won't follow other
|
||||
<LINK> tags, like <LINK REL="home">, for instance,
|
||||
as they refer to external documents. */
|
||||
if (!closure->dash_p_leaf_HTML
|
||||
|| (rel && !strcasecmp (rel, "stylesheet")))
|
||||
handle_link (closure, href, tag, id);
|
||||
struct urlpos *entry;
|
||||
entry = handle_link (closure, href, tag, id);
|
||||
if (entry != NULL)
|
||||
{
|
||||
char *rel = find_attr (tag, "rel", NULL);
|
||||
if (rel && 0 == strcasecmp (rel, "stylesheet"))
|
||||
entry->link_inline_p = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
|
||||
/* Analyze HTML tags FILE and construct a list of URLs referenced from
|
||||
it. It merges relative links in FILE with URL. It is aware of
|
||||
<base href=...> and does the right thing.
|
||||
|
||||
If dash_p_leaf_HTML is non-zero, only the elements needed to render
|
||||
FILE ("non-external" links) will be returned. */
|
||||
<base href=...> and does the right thing. */
|
||||
struct urlpos *
|
||||
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
|
||||
int *meta_disallow_follow)
|
||||
get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct collect_urls_closure closure;
|
||||
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
|
||||
closure.base = NULL;
|
||||
closure.parent_base = url ? url : opt.base_href;
|
||||
closure.document_file = file;
|
||||
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
|
||||
closure.nofollow = 0;
|
||||
|
||||
if (!interesting_tags)
|
||||
|
16
src/recur.c
16
src/recur.c
@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
|
||||
if (descend)
|
||||
{
|
||||
int meta_disallow_follow = 0;
|
||||
struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
|
||||
&meta_disallow_follow);
|
||||
struct urlpos *children
|
||||
= get_urls_html (file, url, &meta_disallow_follow);
|
||||
|
||||
if (opt.use_robots && meta_disallow_follow)
|
||||
{
|
||||
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
|
||||
{
|
||||
if (child->ignore_when_downloading)
|
||||
continue;
|
||||
if (dash_p_leaf_HTML && !child->link_inline_p)
|
||||
continue;
|
||||
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
|
||||
blacklist))
|
||||
{
|
||||
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
/* 4. Check for parent directory.
|
||||
|
||||
If we descended to a different host or changed the scheme, ignore
|
||||
opt.no_parent. Also ignore it for -p leaf retrievals. */
|
||||
opt.no_parent. Also ignore it for documents needed to display
|
||||
the parent page when in -p mode. */
|
||||
if (opt.no_parent
|
||||
&& u->scheme == start_url_parsed->scheme
|
||||
&& 0 == strcasecmp (u->host, start_url_parsed->host)
|
||||
&& u->port == start_url_parsed->port)
|
||||
&& u->port == start_url_parsed->port
|
||||
&& !(opt.page_requisites && upos->link_inline_p))
|
||||
{
|
||||
if (!frontcmp (start_url_parsed->dir, u->dir))
|
||||
{
|
||||
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
if (u->file[0] != '\0'
|
||||
&& ((suf = suffix (url)) == NULL
|
||||
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|
||||
|| (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
|
||||
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
|
||||
{
|
||||
if (!acceptable (u->file))
|
||||
{
|
||||
@@ -674,7 +678,7 @@ convert_all_links (void)
|
||||
DEBUGP (("I cannot find the corresponding URL.\n"));
|
||||
|
||||
/* Parse the HTML file... */
|
||||
urls = get_urls_html (html->string, url, FALSE, NULL);
|
||||
urls = get_urls_html (html->string, url, NULL);
|
||||
|
||||
/* We don't respect meta_disallow_follow here because, even if
|
||||
the file is not followed, we might still want to convert the
|
||||
|
@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
|
||||
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
|
||||
url_list = (html ? get_urls_html (file, NULL, NULL)
|
||||
: get_urls_file (file));
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
17
src/url.h
17
src/url.h
@@ -79,16 +79,17 @@ struct urlpos {
|
||||
char *local_name; /* local file to which it was saved
|
||||
(used by convert_links) */
|
||||
|
||||
int ignore_when_downloading; /* reserved for special links such as
|
||||
<base href="..."> which are used
|
||||
when converting links, but ignored
|
||||
when downloading. */
|
||||
/* reserved for special links such as <base href="..."> which are
|
||||
used when converting links, but ignored when downloading. */
|
||||
unsigned int ignore_when_downloading :1;
|
||||
|
||||
/* Information about the original link: */
|
||||
int link_relative_p; /* was the link relative? */
|
||||
int link_complete_p; /* was the link complete (with the
|
||||
|
||||
unsigned int link_relative_p :1; /* was the link relative? */
|
||||
unsigned int link_complete_p :1; /* was the link complete (with the
|
||||
host name, etc.) */
|
||||
int link_base_p; /* was the link <base href=...> */
|
||||
unsigned int link_base_p :1; /* was the link <base href=...> */
|
||||
unsigned int link_inline_p :1; /* needed to render the page. */
|
||||
|
||||
/* Conversion requirements: */
|
||||
enum convert_options convert; /* is conversion required? */
|
||||
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
|
||||
char *url_string PARAMS ((const struct url *, int));
|
||||
|
||||
struct urlpos *get_urls_file PARAMS ((const char *));
|
||||
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
|
||||
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
|
||||
void free_urlpos PARAMS ((struct urlpos *));
|
||||
|
||||
char *uri_merge PARAMS ((const char *, const char *));
|
||||
|
Loading…
Reference in New Issue
Block a user