mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Ignore -np when in -p mode.
Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
This commit is contained in:
parent
5deec11234
commit
a4db28e20f
6
TODO
6
TODO
@ -17,12 +17,6 @@ changes.
|
|||||||
|
|
||||||
* -p should probably go "_two_ more hops" on <FRAMESET> pages.
|
* -p should probably go "_two_ more hops" on <FRAMESET> pages.
|
||||||
|
|
||||||
* Only normal link-following recursion should respect -np. Page-requisite
|
|
||||||
recursion should not. When -np -p is specified, Wget should still retrieve
|
|
||||||
requisite images and such on the server, even if they aren't in that directory
|
|
||||||
or a subdirectory of it. Likewise, -H -np -p should retrieve requisite files
|
|
||||||
from other hosts.
|
|
||||||
|
|
||||||
* Add a --range parameter allowing you to explicitly specify a range of bytes to
|
* Add a --range parameter allowing you to explicitly specify a range of bytes to
|
||||||
get from a file over HTTP (FTP only supports ranges ending at the end of the
|
get from a file over HTTP (FTP only supports ranges ending at the end of the
|
||||||
file, though forcibly disconnecting from the server at the desired endpoint
|
file, though forcibly disconnecting from the server at the desired endpoint
|
||||||
|
@ -1,3 +1,16 @@
|
|||||||
|
2001-11-30 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||||
|
|
||||||
|
* recur.c (retrieve_tree): Skip the non-inline entries when
|
||||||
|
enqueuing the children of a leaf HTML node in -p mode.
|
||||||
|
(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
|
||||||
|
"inline".
|
||||||
|
|
||||||
|
* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
|
||||||
|
(collect_tags_mapper): When an entry is "inline", mark it as such.
|
||||||
|
|
||||||
|
* recur.c (descend_url_p): Fix test when checking for
|
||||||
|
acceptance/rejection rules.
|
||||||
|
|
||||||
2001-10-31 Daniel BODEA <dali@dali-designs.com>
|
2001-10-31 Daniel BODEA <dali@dali-designs.com>
|
||||||
|
|
||||||
* netrc.c (search_netrc): When slack_default is 0, still look for
|
* netrc.c (search_netrc): When slack_default is 0, still look for
|
||||||
|
@ -287,9 +287,6 @@ struct collect_urls_closure {
|
|||||||
struct urlpos *head, *tail; /* List of URLs */
|
struct urlpos *head, *tail; /* List of URLs */
|
||||||
const char *parent_base; /* Base of the current document. */
|
const char *parent_base; /* Base of the current document. */
|
||||||
const char *document_file; /* File name of this document. */
|
const char *document_file; /* File name of this document. */
|
||||||
int dash_p_leaf_HTML; /* Whether -p is specified, and this
|
|
||||||
document is the "leaf" node of the
|
|
||||||
HTML tree. */
|
|
||||||
int nofollow; /* whether NOFOLLOW was specified in a
|
int nofollow; /* whether NOFOLLOW was specified in a
|
||||||
<meta name=robots> tag. */
|
<meta name=robots> tag. */
|
||||||
};
|
};
|
||||||
@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
|||||||
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
|
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
|
||||||
i++)
|
i++)
|
||||||
{
|
{
|
||||||
char *attr_value;
|
if (0 == strcasecmp (tag->attrs[id].name,
|
||||||
if (closure->dash_p_leaf_HTML
|
url_tag_attr_map[i].attr_name))
|
||||||
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
|
|
||||||
/* If we're at a -p leaf node, we don't want to retrieve
|
|
||||||
links to references we know are external to this document,
|
|
||||||
such as <a href=...>. */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!strcasecmp (tag->attrs[id].name,
|
|
||||||
url_tag_attr_map[i].attr_name))
|
|
||||||
{
|
{
|
||||||
attr_value = tag->attrs[id].value;
|
char *attr_value = tag->attrs[id].value;
|
||||||
if (attr_value)
|
if (attr_value)
|
||||||
handle_link (closure, attr_value, tag, id);
|
{
|
||||||
|
struct urlpos *entry;
|
||||||
|
entry = handle_link (closure, attr_value, tag, id);
|
||||||
|
if (entry != NULL
|
||||||
|
&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
|
||||||
|
entry->link_inline_p = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
|||||||
case TAG_LINK:
|
case TAG_LINK:
|
||||||
{
|
{
|
||||||
int id;
|
int id;
|
||||||
char *rel = find_attr (tag, "rel", NULL);
|
|
||||||
char *href = find_attr (tag, "href", &id);
|
char *href = find_attr (tag, "href", &id);
|
||||||
|
|
||||||
|
/* All <link href="..."> link references are external,
|
||||||
|
except for <link rel="stylesheet" href="...">. */
|
||||||
if (href)
|
if (href)
|
||||||
{
|
{
|
||||||
/* In the normal case, all <link href=...> tags are
|
struct urlpos *entry;
|
||||||
fair game.
|
entry = handle_link (closure, href, tag, id);
|
||||||
|
if (entry != NULL)
|
||||||
In the special case of when -p is active, however,
|
{
|
||||||
and we're at a leaf node (relative to the -l
|
char *rel = find_attr (tag, "rel", NULL);
|
||||||
max. depth) in the HTML document tree, the only
|
if (rel && 0 == strcasecmp (rel, "stylesheet"))
|
||||||
<LINK> tag we'll follow is a <LINK REL=
|
entry->link_inline_p = 1;
|
||||||
"stylesheet">, as it'll be necessary for displaying
|
}
|
||||||
this document properly. We won't follow other
|
|
||||||
<LINK> tags, like <LINK REL="home">, for instance,
|
|
||||||
as they refer to external documents. */
|
|
||||||
if (!closure->dash_p_leaf_HTML
|
|
||||||
|| (rel && !strcasecmp (rel, "stylesheet")))
|
|
||||||
handle_link (closure, href, tag, id);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
|||||||
|
|
||||||
/* Analyze HTML tags FILE and construct a list of URLs referenced from
|
/* Analyze HTML tags FILE and construct a list of URLs referenced from
|
||||||
it. It merges relative links in FILE with URL. It is aware of
|
it. It merges relative links in FILE with URL. It is aware of
|
||||||
<base href=...> and does the right thing.
|
<base href=...> and does the right thing. */
|
||||||
|
|
||||||
If dash_p_leaf_HTML is non-zero, only the elements needed to render
|
|
||||||
FILE ("non-external" links) will be returned. */
|
|
||||||
struct urlpos *
|
struct urlpos *
|
||||||
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
|
get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
|
||||||
int *meta_disallow_follow)
|
|
||||||
{
|
{
|
||||||
struct file_memory *fm;
|
struct file_memory *fm;
|
||||||
struct collect_urls_closure closure;
|
struct collect_urls_closure closure;
|
||||||
@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
|
|||||||
closure.base = NULL;
|
closure.base = NULL;
|
||||||
closure.parent_base = url ? url : opt.base_href;
|
closure.parent_base = url ? url : opt.base_href;
|
||||||
closure.document_file = file;
|
closure.document_file = file;
|
||||||
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
|
|
||||||
closure.nofollow = 0;
|
closure.nofollow = 0;
|
||||||
|
|
||||||
if (!interesting_tags)
|
if (!interesting_tags)
|
||||||
|
16
src/recur.c
16
src/recur.c
@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
|
|||||||
if (descend)
|
if (descend)
|
||||||
{
|
{
|
||||||
int meta_disallow_follow = 0;
|
int meta_disallow_follow = 0;
|
||||||
struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
|
struct urlpos *children
|
||||||
&meta_disallow_follow);
|
= get_urls_html (file, url, &meta_disallow_follow);
|
||||||
|
|
||||||
if (opt.use_robots && meta_disallow_follow)
|
if (opt.use_robots && meta_disallow_follow)
|
||||||
{
|
{
|
||||||
@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
|
|||||||
{
|
{
|
||||||
if (child->ignore_when_downloading)
|
if (child->ignore_when_downloading)
|
||||||
continue;
|
continue;
|
||||||
|
if (dash_p_leaf_HTML && !child->link_inline_p)
|
||||||
|
continue;
|
||||||
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
|
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
|
||||||
blacklist))
|
blacklist))
|
||||||
{
|
{
|
||||||
@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
/* 4. Check for parent directory.
|
/* 4. Check for parent directory.
|
||||||
|
|
||||||
If we descended to a different host or changed the scheme, ignore
|
If we descended to a different host or changed the scheme, ignore
|
||||||
opt.no_parent. Also ignore it for -p leaf retrievals. */
|
opt.no_parent. Also ignore it for documents needed to display
|
||||||
|
the parent page when in -p mode. */
|
||||||
if (opt.no_parent
|
if (opt.no_parent
|
||||||
&& u->scheme == start_url_parsed->scheme
|
&& u->scheme == start_url_parsed->scheme
|
||||||
&& 0 == strcasecmp (u->host, start_url_parsed->host)
|
&& 0 == strcasecmp (u->host, start_url_parsed->host)
|
||||||
&& u->port == start_url_parsed->port)
|
&& u->port == start_url_parsed->port
|
||||||
|
&& !(opt.page_requisites && upos->link_inline_p))
|
||||||
{
|
{
|
||||||
if (!frontcmp (start_url_parsed->dir, u->dir))
|
if (!frontcmp (start_url_parsed->dir, u->dir))
|
||||||
{
|
{
|
||||||
@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
if (u->file[0] != '\0'
|
if (u->file[0] != '\0'
|
||||||
&& ((suf = suffix (url)) == NULL
|
&& ((suf = suffix (url)) == NULL
|
||||||
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|
||||||
|| (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
|
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
|
||||||
{
|
{
|
||||||
if (!acceptable (u->file))
|
if (!acceptable (u->file))
|
||||||
{
|
{
|
||||||
@ -674,7 +678,7 @@ convert_all_links (void)
|
|||||||
DEBUGP (("I cannot find the corresponding URL.\n"));
|
DEBUGP (("I cannot find the corresponding URL.\n"));
|
||||||
|
|
||||||
/* Parse the HTML file... */
|
/* Parse the HTML file... */
|
||||||
urls = get_urls_html (html->string, url, FALSE, NULL);
|
urls = get_urls_html (html->string, url, NULL);
|
||||||
|
|
||||||
/* We don't respect meta_disallow_follow here because, even if
|
/* We don't respect meta_disallow_follow here because, even if
|
||||||
the file is not followed, we might still want to convert the
|
the file is not followed, we might still want to convert the
|
||||||
|
@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
|
|||||||
uerr_t status;
|
uerr_t status;
|
||||||
struct urlpos *url_list, *cur_url;
|
struct urlpos *url_list, *cur_url;
|
||||||
|
|
||||||
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
|
url_list = (html ? get_urls_html (file, NULL, NULL)
|
||||||
: get_urls_file (file));
|
: get_urls_file (file));
|
||||||
status = RETROK; /* Suppose everything is OK. */
|
status = RETROK; /* Suppose everything is OK. */
|
||||||
*count = 0; /* Reset the URL count. */
|
*count = 0; /* Reset the URL count. */
|
||||||
|
19
src/url.h
19
src/url.h
@ -79,16 +79,17 @@ struct urlpos {
|
|||||||
char *local_name; /* local file to which it was saved
|
char *local_name; /* local file to which it was saved
|
||||||
(used by convert_links) */
|
(used by convert_links) */
|
||||||
|
|
||||||
int ignore_when_downloading; /* reserved for special links such as
|
/* reserved for special links such as <base href="..."> which are
|
||||||
<base href="..."> which are used
|
used when converting links, but ignored when downloading. */
|
||||||
when converting links, but ignored
|
unsigned int ignore_when_downloading :1;
|
||||||
when downloading. */
|
|
||||||
|
|
||||||
/* Information about the original link: */
|
/* Information about the original link: */
|
||||||
int link_relative_p; /* was the link relative? */
|
|
||||||
int link_complete_p; /* was the link complete (with the
|
unsigned int link_relative_p :1; /* was the link relative? */
|
||||||
host name, etc.) */
|
unsigned int link_complete_p :1; /* was the link complete (with the
|
||||||
int link_base_p; /* was the link <base href=...> */
|
host name, etc.) */
|
||||||
|
unsigned int link_base_p :1; /* was the link <base href=...> */
|
||||||
|
unsigned int link_inline_p :1; /* needed to render the page. */
|
||||||
|
|
||||||
/* Conversion requirements: */
|
/* Conversion requirements: */
|
||||||
enum convert_options convert; /* is conversion required? */
|
enum convert_options convert; /* is conversion required? */
|
||||||
@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
|
|||||||
char *url_string PARAMS ((const struct url *, int));
|
char *url_string PARAMS ((const struct url *, int));
|
||||||
|
|
||||||
struct urlpos *get_urls_file PARAMS ((const char *));
|
struct urlpos *get_urls_file PARAMS ((const char *));
|
||||||
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
|
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
|
||||||
void free_urlpos PARAMS ((struct urlpos *));
|
void free_urlpos PARAMS ((struct urlpos *));
|
||||||
|
|
||||||
char *uri_merge PARAMS ((const char *, const char *));
|
char *uri_merge PARAMS ((const char *, const char *));
|
||||||
|
Loading…
Reference in New Issue
Block a user