mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Handle <base href=...> when converting links.
Published in <sxsadxaae3t.fsf@florida.arsdigita.de>.
This commit is contained in:
parent
2e6e3f21f8
commit
df05e7ff10
19
NEWS
19
NEWS
@ -7,13 +7,24 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
|
||||
|
||||
* Changes in Wget 1.8.
|
||||
|
||||
** "Recursive retrieval" now uses a breadth-first algorithm.
|
||||
Recursive downloads are faster and consume *significantly* less memory
|
||||
than before.
|
||||
|
||||
** A new progress indicator is now available. Try it with
|
||||
--progress=bar or using `progress = bar' in `.wgetrc'.
|
||||
|
||||
** "Recursive retrieval" has been revamped:
|
||||
|
||||
*** Wget now traverses links breadth-first. This makes the
|
||||
calculation of depth much more reliable than before. Also, recursive
|
||||
downloads are faster and consume *significantly* less memory than
|
||||
before.
|
||||
|
||||
*** Links are converted only when the entire retrieval is complete.
|
||||
This is the only safe thing to do, as only then is it known what URLs
|
||||
have been downloaded.
|
||||
|
||||
*** BASE tags are handled correctly when converting links. Since Wget
|
||||
already resolves <base href="..."> when resolving handling URLs, link
|
||||
conversion now makes the BASE tags point to an empty string.
|
||||
|
||||
** Host directories now contain port information if the URL is at a
|
||||
non-standard port.
|
||||
|
||||
|
2
TODO
2
TODO
@ -55,8 +55,6 @@ changes.
|
||||
* Make -K compare X.orig to X and move the former on top of the latter if
|
||||
they're the same, rather than leaving identical .orig files laying around.
|
||||
|
||||
* Make `-k' convert <base href=...> too.
|
||||
|
||||
* Make `-k' check for files that were downloaded in the past and convert links
|
||||
to them in newly-downloaded documents.
|
||||
|
||||
|
@ -1,3 +1,14 @@
|
||||
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* url.c (convert_links): Handle CO_NULLIFY_BASE.
|
||||
|
||||
* recur.c (retrieve_tree): Ignore download-ignorable children.
|
||||
(convert_all_links): Specify CO_NULLIFY_BASE when link_base_p.
|
||||
|
||||
* html-url.c (handle_link): Return the newly created urlpos.
|
||||
(collect_tags_mapper): When dealing with BASE, store the base
|
||||
reference and mark it as download-ignorable.
|
||||
|
||||
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||
|
||||
* url.c (convert_links): Attempt to quote '?' as "%3F" when
|
||||
|
@ -297,7 +297,7 @@ struct collect_urls_closure {
|
||||
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
|
||||
are the necessary context to store the position and size. */
|
||||
|
||||
static void
|
||||
static struct urlpos *
|
||||
handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
struct taginfo *tag, int attrid)
|
||||
{
|
||||
@ -316,7 +316,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
/* We have no base, and the link does not have a host
|
||||
attached to it. Nothing we can do. */
|
||||
/* #### Should we print a warning here? Wget 1.5.x used to. */
|
||||
return;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
url = url_parse (link_uri, NULL);
|
||||
@ -324,7 +324,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
{
|
||||
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
|
||||
closure->document_file, link_uri));
|
||||
return;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -344,7 +344,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
|
||||
closure->document_file, complete_uri));
|
||||
xfree (complete_uri);
|
||||
return;
|
||||
return NULL;
|
||||
}
|
||||
xfree (complete_uri);
|
||||
}
|
||||
@ -371,6 +371,8 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
|
||||
}
|
||||
else
|
||||
closure->tail = closure->head = newel;
|
||||
|
||||
return newel;
|
||||
}
|
||||
|
||||
/* Examine name and attributes of TAG and take appropriate action.
|
||||
@ -435,9 +437,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
{
|
||||
case TAG_BASE:
|
||||
{
|
||||
char *newbase = find_attr (tag, "href", NULL);
|
||||
struct urlpos *base_urlpos;
|
||||
int id;
|
||||
char *newbase = find_attr (tag, "href", &id);
|
||||
if (!newbase)
|
||||
break;
|
||||
|
||||
base_urlpos = handle_link (closure, newbase, tag, id);
|
||||
if (!base_urlpos)
|
||||
break;
|
||||
base_urlpos->ignore_when_downloading = 1;
|
||||
base_urlpos->link_base_p = 1;
|
||||
|
||||
if (closure->base)
|
||||
xfree (closure->base);
|
||||
if (closure->parent_base)
|
||||
@ -545,13 +556,13 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
}
|
||||
|
||||
/* Analyze HTML tags FILE and construct a list of URLs referenced from
|
||||
it. It merges relative links in FILE with THIS_URL. It is aware
|
||||
of <base href=...> and does the right thing.
|
||||
it. It merges relative links in FILE with URL. It is aware of
|
||||
<base href=...> and does the right thing.
|
||||
|
||||
If dash_p_leaf_HTML is non-zero, only the elements needed to render
|
||||
FILE ("non-external" links) will be returned. */
|
||||
struct urlpos *
|
||||
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
|
||||
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
|
||||
int *meta_disallow_follow)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
@ -569,7 +580,7 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
|
||||
closure.text = fm->content;
|
||||
closure.head = closure.tail = NULL;
|
||||
closure.base = NULL;
|
||||
closure.parent_base = this_url ? this_url : opt.base_href;
|
||||
closure.parent_base = url ? url : opt.base_href;
|
||||
closure.document_file = file;
|
||||
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
|
||||
closure.nofollow = 0;
|
||||
|
13
src/recur.c
13
src/recur.c
@ -280,6 +280,8 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
for (; child; child = child->next)
|
||||
{
|
||||
if (child->ignore_when_downloading)
|
||||
continue;
|
||||
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
|
||||
blacklist))
|
||||
{
|
||||
@ -634,6 +636,15 @@ convert_all_links (void)
|
||||
char *local_name;
|
||||
struct url *u = cur_url->url;
|
||||
|
||||
if (cur_url->link_base_p)
|
||||
{
|
||||
/* Base references have been resolved by our parser, so
|
||||
we turn the base URL into an empty string. (Perhaps
|
||||
we should remove the tag entirely?) */
|
||||
cur_url->convert = CO_NULLIFY_BASE;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* We decide the direction of conversion according to whether
|
||||
a URL was downloaded. Downloaded URLs will be converted
|
||||
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
|
||||
@ -642,7 +653,7 @@ convert_all_links (void)
|
||||
DEBUGP (("%s marked for conversion, local %s\n",
|
||||
u->url, local_name));
|
||||
|
||||
/* Decide on the conversion direction. */
|
||||
/* Decide on the conversion type. */
|
||||
if (local_name)
|
||||
{
|
||||
/* We've downloaded this URL. Convert it to relative
|
||||
|
@ -462,6 +462,9 @@ retrieve_from_file (const char *file, int html, int *count)
|
||||
char *filename = NULL, *new_file;
|
||||
int dt;
|
||||
|
||||
if (cur_url->ignore_when_downloading)
|
||||
continue;
|
||||
|
||||
if (downloaded_exceeds_quota ())
|
||||
{
|
||||
status = QUOTEXC;
|
||||
|
73
src/url.c
73
src/url.c
@ -1698,7 +1698,7 @@ no_proxy_match (const char *host, const char **no_proxy)
|
||||
}
|
||||
|
||||
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
|
||||
static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
|
||||
static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
|
||||
static char *local_quote_string PARAMS ((const char *));
|
||||
|
||||
/* Change the links in one HTML file. LINKS is a list of links in the
|
||||
@ -1765,6 +1765,7 @@ convert_links (const char *file, struct urlpos *links)
|
||||
read_file_free (fm);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Here we loop through all the URLs in file, replacing those of
|
||||
them that are downloaded with relative references. */
|
||||
p = fm->content;
|
||||
@ -1788,35 +1789,50 @@ convert_links (const char *file, struct urlpos *links)
|
||||
quote, to the outfile. */
|
||||
fwrite (p, 1, url_start - p, fp);
|
||||
p = url_start;
|
||||
if (link->convert == CO_CONVERT_TO_RELATIVE)
|
||||
|
||||
switch (link->convert)
|
||||
{
|
||||
case CO_CONVERT_TO_RELATIVE:
|
||||
/* Convert absolute URL to relative. */
|
||||
char *newname = construct_relative (file, link->local_name);
|
||||
char *quoted_newname = local_quote_string (newname);
|
||||
replace_attr (&p, link->size, fp, quoted_newname);
|
||||
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
||||
link->url->url, newname, link->pos, file));
|
||||
xfree (newname);
|
||||
xfree (quoted_newname);
|
||||
++to_file_count;
|
||||
}
|
||||
else if (link->convert == CO_CONVERT_TO_COMPLETE)
|
||||
{
|
||||
{
|
||||
char *newname = construct_relative (file, link->local_name);
|
||||
char *quoted_newname = local_quote_string (newname);
|
||||
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
||||
link->url->url, newname, link->pos, file));
|
||||
xfree (newname);
|
||||
xfree (quoted_newname);
|
||||
++to_file_count;
|
||||
break;
|
||||
}
|
||||
case CO_CONVERT_TO_COMPLETE:
|
||||
/* Convert the link to absolute URL. */
|
||||
char *newlink = link->url->url;
|
||||
char *quoted_newlink = html_quote_string (newlink);
|
||||
replace_attr (&p, link->size, fp, quoted_newlink);
|
||||
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
||||
newlink, link->pos, file));
|
||||
xfree (quoted_newlink);
|
||||
++to_url_count;
|
||||
{
|
||||
char *newlink = link->url->url;
|
||||
char *quoted_newlink = html_quote_string (newlink);
|
||||
p = replace_attr (p, link->size, fp, quoted_newlink);
|
||||
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
||||
newlink, link->pos, file));
|
||||
xfree (quoted_newlink);
|
||||
++to_url_count;
|
||||
break;
|
||||
}
|
||||
case CO_NULLIFY_BASE:
|
||||
/* Change the base href to "". */
|
||||
p = replace_attr (p, link->size, fp, "");
|
||||
break;
|
||||
case CO_NOCONVERT:
|
||||
abort ();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Output the rest of the file. */
|
||||
if (p - fm->content < fm->length)
|
||||
fwrite (p, 1, fm->length - (p - fm->content), fp);
|
||||
fclose (fp);
|
||||
read_file_free (fm);
|
||||
|
||||
logprintf (LOG_VERBOSE,
|
||||
_("%d-%d\n"), to_file_count, to_url_count);
|
||||
}
|
||||
@ -1960,13 +1976,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
||||
static int find_fragment PARAMS ((const char *, int, const char **,
|
||||
const char **));
|
||||
|
||||
static void
|
||||
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
|
||||
/* Replace an attribute's original text with NEW_TEXT. */
|
||||
|
||||
static const char *
|
||||
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
|
||||
{
|
||||
const char *p = *pp;
|
||||
int quote_flag = 0;
|
||||
int size = raw_size;
|
||||
char quote_char = '\"';
|
||||
char quote_char = '\"'; /* use "..." for quoting, unless the
|
||||
original value is quoted, in which
|
||||
case reuse its quoting char. */
|
||||
const char *frag_beg, *frag_end;
|
||||
|
||||
/* Structure of our string is:
|
||||
@ -1984,7 +2002,7 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
|
||||
size -= 2; /* disregard opening and closing quote */
|
||||
}
|
||||
putc (quote_char, fp);
|
||||
fputs (new_str, fp);
|
||||
fputs (new_text, fp);
|
||||
|
||||
/* Look for fragment identifier, if any. */
|
||||
if (find_fragment (p, size, &frag_beg, &frag_end))
|
||||
@ -1993,7 +2011,8 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
|
||||
if (quote_flag)
|
||||
++p;
|
||||
putc (quote_char, fp);
|
||||
*pp = p;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
|
||||
|
@ -65,8 +65,9 @@ enum convert_options {
|
||||
CO_NOCONVERT = 0, /* don't convert this URL */
|
||||
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
|
||||
"../../otherdir/foo.gif" */
|
||||
CO_CONVERT_TO_COMPLETE /* convert to absolute, e.g. to
|
||||
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
|
||||
"http://orighost/somedir/bar.jpg". */
|
||||
CO_NULLIFY_BASE /* change to empty string. */
|
||||
};
|
||||
|
||||
/* A structure that defines the whereabouts of a URL, i.e. its
|
||||
@ -78,10 +79,16 @@ struct urlpos {
|
||||
char *local_name; /* local file to which it was saved
|
||||
(used by convert_links) */
|
||||
|
||||
int ignore_when_downloading; /* reserved for special links such as
|
||||
<base href="..."> which are used
|
||||
when converting links, but ignored
|
||||
when downloading. */
|
||||
|
||||
/* Information about the original link: */
|
||||
int link_relative_p; /* was the link relative? */
|
||||
int link_complete_p; /* was the link complete (with the
|
||||
host name, etc.) */
|
||||
int link_base_p; /* was the link <base href=...> */
|
||||
|
||||
/* Conversion requirements: */
|
||||
enum convert_options convert; /* is conversion required? */
|
||||
|
Loading…
Reference in New Issue
Block a user