1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Handle <base href=...> when converting links.

Published in <sxsadxaae3t.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-11-25 10:40:55 -08:00
parent 2e6e3f21f8
commit df05e7ff10
8 changed files with 115 additions and 44 deletions

19
NEWS
View File

@ -7,13 +7,24 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
* Changes in Wget 1.8.
** "Recursive retrieval" now uses a breadth-first algorithm.
Recursive downloads are faster and consume *significantly* less memory
than before.
** A new progress indicator is now available. Try it with
--progress=bar or using `progress = bar' in `.wgetrc'.
** "Recursive retrieval" has been revamped:
*** Wget now traverses links breadth-first. This makes the
calculation of depth much more reliable than before. Also, recursive
downloads are faster and consume *significantly* less memory than
before.
*** Links are converted only when the entire retrieval is complete.
This is the only safe thing to do, as only then is it known what URLs
have been downloaded.
*** BASE tags are handled correctly when converting links. Since Wget
already resolves <base href="..."> when resolving handling URLs, link
conversion now makes the BASE tags point to an empty string.
** Host directories now contain port information if the URL is at a
non-standard port.

2
TODO
View File

@ -55,8 +55,6 @@ changes.
* Make -K compare X.orig to X and move the former on top of the latter if
they're the same, rather than leaving identical .orig files laying around.
* Make `-k' convert <base href=...> too.
* Make `-k' check for files that were downloaded in the past and convert links
to them in newly-downloaded documents.

View File

@ -1,3 +1,14 @@
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Handle CO_NULLIFY_BASE.
* recur.c (retrieve_tree): Ignore download-ignorable children.
(convert_all_links): Specify CO_NULLIFY_BASE when link_base_p.
* html-url.c (handle_link): Return the newly created urlpos.
(collect_tags_mapper): When dealing with BASE, store the base
reference and mark it as download-ignorable.
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Attempt to quote '?' as "%3F" when

View File

@ -297,7 +297,7 @@ struct collect_urls_closure {
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
are the necessary context to store the position and size. */
static void
static struct urlpos *
handle_link (struct collect_urls_closure *closure, const char *link_uri,
struct taginfo *tag, int attrid)
{
@ -316,7 +316,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
/* We have no base, and the link does not have a host
attached to it. Nothing we can do. */
/* #### Should we print a warning here? Wget 1.5.x used to. */
return;
return NULL;
}
url = url_parse (link_uri, NULL);
@ -324,7 +324,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
closure->document_file, link_uri));
return;
return NULL;
}
}
else
@ -344,7 +344,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
closure->document_file, complete_uri));
xfree (complete_uri);
return;
return NULL;
}
xfree (complete_uri);
}
@ -371,6 +371,8 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
}
else
closure->tail = closure->head = newel;
return newel;
}
/* Examine name and attributes of TAG and take appropriate action.
@ -435,9 +437,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
{
case TAG_BASE:
{
char *newbase = find_attr (tag, "href", NULL);
struct urlpos *base_urlpos;
int id;
char *newbase = find_attr (tag, "href", &id);
if (!newbase)
break;
base_urlpos = handle_link (closure, newbase, tag, id);
if (!base_urlpos)
break;
base_urlpos->ignore_when_downloading = 1;
base_urlpos->link_base_p = 1;
if (closure->base)
xfree (closure->base);
if (closure->parent_base)
@ -545,13 +556,13 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
}
/* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with THIS_URL. It is aware
of <base href=...> and does the right thing.
it. It merges relative links in FILE with URL. It is aware of
<base href=...> and does the right thing.
If dash_p_leaf_HTML is non-zero, only the elements needed to render
FILE ("non-external" links) will be returned. */
struct urlpos *
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
int *meta_disallow_follow)
{
struct file_memory *fm;
@ -569,7 +580,7 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
closure.text = fm->content;
closure.head = closure.tail = NULL;
closure.base = NULL;
closure.parent_base = this_url ? this_url : opt.base_href;
closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;

View File

@ -280,6 +280,8 @@ retrieve_tree (const char *start_url)
for (; child; child = child->next)
{
if (child->ignore_when_downloading)
continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
@ -634,6 +636,15 @@ convert_all_links (void)
char *local_name;
struct url *u = cur_url->url;
if (cur_url->link_base_p)
{
/* Base references have been resolved by our parser, so
we turn the base URL into an empty string. (Perhaps
we should remove the tag entirely?) */
cur_url->convert = CO_NULLIFY_BASE;
continue;
}
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
@ -642,7 +653,7 @@ convert_all_links (void)
DEBUGP (("%s marked for conversion, local %s\n",
u->url, local_name));
/* Decide on the conversion direction. */
/* Decide on the conversion type. */
if (local_name)
{
/* We've downloaded this URL. Convert it to relative

View File

@ -462,6 +462,9 @@ retrieve_from_file (const char *file, int html, int *count)
char *filename = NULL, *new_file;
int dt;
if (cur_url->ignore_when_downloading)
continue;
if (downloaded_exceeds_quota ())
{
status = QUOTEXC;

View File

@ -1698,7 +1698,7 @@ no_proxy_match (const char *host, const char **no_proxy)
}
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file. LINKS is a list of links in the
@ -1765,6 +1765,7 @@ convert_links (const char *file, struct urlpos *links)
read_file_free (fm);
return;
}
/* Here we loop through all the URLs in file, replacing those of
them that are downloaded with relative references. */
p = fm->content;
@ -1788,35 +1789,50 @@ convert_links (const char *file, struct urlpos *links)
quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
p = url_start;
if (link->convert == CO_CONVERT_TO_RELATIVE)
switch (link->convert)
{
case CO_CONVERT_TO_RELATIVE:
/* Convert absolute URL to relative. */
char *newname = construct_relative (file, link->local_name);
char *quoted_newname = local_quote_string (newname);
replace_attr (&p, link->size, fp, quoted_newname);
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
link->url->url, newname, link->pos, file));
xfree (newname);
xfree (quoted_newname);
++to_file_count;
}
else if (link->convert == CO_CONVERT_TO_COMPLETE)
{
{
char *newname = construct_relative (file, link->local_name);
char *quoted_newname = local_quote_string (newname);
p = replace_attr (p, link->size, fp, quoted_newname);
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
link->url->url, newname, link->pos, file));
xfree (newname);
xfree (quoted_newname);
++to_file_count;
break;
}
case CO_CONVERT_TO_COMPLETE:
/* Convert the link to absolute URL. */
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
replace_attr (&p, link->size, fp, quoted_newlink);
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, link->pos, file));
xfree (quoted_newlink);
++to_url_count;
{
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
p = replace_attr (p, link->size, fp, quoted_newlink);
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, link->pos, file));
xfree (quoted_newlink);
++to_url_count;
break;
}
case CO_NULLIFY_BASE:
/* Change the base href to "". */
p = replace_attr (p, link->size, fp, "");
break;
case CO_NOCONVERT:
abort ();
break;
}
}
/* Output the rest of the file. */
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
read_file_free (fm);
logprintf (LOG_VERBOSE,
_("%d-%d\n"), to_file_count, to_url_count);
}
@ -1960,13 +1976,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
static int find_fragment PARAMS ((const char *, int, const char **,
const char **));
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
/* Replace an attribute's original text with NEW_TEXT. */
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
const char *p = *pp;
int quote_flag = 0;
int size = raw_size;
char quote_char = '\"';
char quote_char = '\"'; /* use "..." for quoting, unless the
original value is quoted, in which
case reuse its quoting char. */
const char *frag_beg, *frag_end;
/* Structure of our string is:
@ -1984,7 +2002,7 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
size -= 2; /* disregard opening and closing quote */
}
putc (quote_char, fp);
fputs (new_str, fp);
fputs (new_text, fp);
/* Look for fragment identifier, if any. */
if (find_fragment (p, size, &frag_beg, &frag_end))
@ -1993,7 +2011,8 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
if (quote_flag)
++p;
putc (quote_char, fp);
*pp = p;
return p;
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not

View File

@ -65,8 +65,9 @@ enum convert_options {
CO_NOCONVERT = 0, /* don't convert this URL */
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
"../../otherdir/foo.gif" */
CO_CONVERT_TO_COMPLETE /* convert to absolute, e.g. to
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
"http://orighost/somedir/bar.jpg". */
CO_NULLIFY_BASE /* change to empty string. */
};
/* A structure that defines the whereabouts of a URL, i.e. its
@ -78,10 +79,16 @@ struct urlpos {
char *local_name; /* local file to which it was saved
(used by convert_links) */
int ignore_when_downloading; /* reserved for special links such as
<base href="..."> which are used
when converting links, but ignored
when downloading. */
/* Information about the original link: */
int link_relative_p; /* was the link relative? */
int link_complete_p; /* was the link complete (with the
host name, etc.) */
int link_base_p; /* was the link <base href=...> */
/* Conversion requirements: */
enum convert_options convert; /* is conversion required? */