diff --git a/NEWS b/NEWS index 8a3b0b70..d9849ef0 100644 --- a/NEWS +++ b/NEWS @@ -7,13 +7,24 @@ Please send GNU Wget bug reports to . * Changes in Wget 1.8. -** "Recursive retrieval" now uses a breadth-first algorithm. -Recursive downloads are faster and consume *significantly* less memory -than before. - ** A new progress indicator is now available. Try it with --progress=bar or using `progress = bar' in `.wgetrc'. +** "Recursive retrieval" has been revamped: + +*** Wget now traverses links breadth-first. This makes the +calculation of depth much more reliable than before. Also, recursive +downloads are faster and consume *significantly* less memory than +before. + +*** Links are converted only when the entire retrieval is complete. +This is the only safe thing to do, as only then is it known what URLs +have been downloaded. + +*** BASE tags are handled correctly when converting links. Since Wget +already resolves when resolving handling URLs, link +conversion now makes the BASE tags point to an empty string. + ** Host directories now contain port information if the URL is at a non-standard port. diff --git a/TODO b/TODO index a9cb902e..83ad664d 100644 --- a/TODO +++ b/TODO @@ -55,8 +55,6 @@ changes. * Make -K compare X.orig to X and move the former on top of the latter if they're the same, rather than leaving identical .orig files laying around. -* Make `-k' convert too. - * Make `-k' check for files that were downloaded in the past and convert links to them in newly-downloaded documents. diff --git a/src/ChangeLog b/src/ChangeLog index 58ed40e7..6d2bbcf1 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,14 @@ +2001-11-25 Hrvoje Niksic + + * url.c (convert_links): Handle CO_NULLIFY_BASE. + + * recur.c (retrieve_tree): Ignore download-ignorable children. + (convert_all_links): Specify CO_NULLIFY_BASE when link_base_p. + + * html-url.c (handle_link): Return the newly created urlpos. + (collect_tags_mapper): When dealing with BASE, store the base + reference and mark it as download-ignorable. + 2001-11-25 Hrvoje Niksic * url.c (convert_links): Attempt to quote '?' as "%3F" when diff --git a/src/html-url.c b/src/html-url.c index 91877820..051f5057 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -297,7 +297,7 @@ struct collect_urls_closure { /* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID are the necessary context to store the position and size. */ -static void +static struct urlpos * handle_link (struct collect_urls_closure *closure, const char *link_uri, struct taginfo *tag, int attrid) { @@ -316,7 +316,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, /* We have no base, and the link does not have a host attached to it. Nothing we can do. */ /* #### Should we print a warning here? Wget 1.5.x used to. */ - return; + return NULL; } url = url_parse (link_uri, NULL); @@ -324,7 +324,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, { DEBUGP (("%s: link \"%s\" doesn't parse.\n", closure->document_file, link_uri)); - return; + return NULL; } } else @@ -344,7 +344,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", closure->document_file, complete_uri)); xfree (complete_uri); - return; + return NULL; } xfree (complete_uri); } @@ -371,6 +371,8 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, } else closure->tail = closure->head = newel; + + return newel; } /* Examine name and attributes of TAG and take appropriate action. @@ -435,9 +437,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg) { case TAG_BASE: { - char *newbase = find_attr (tag, "href", NULL); + struct urlpos *base_urlpos; + int id; + char *newbase = find_attr (tag, "href", &id); if (!newbase) break; + + base_urlpos = handle_link (closure, newbase, tag, id); + if (!base_urlpos) + break; + base_urlpos->ignore_when_downloading = 1; + base_urlpos->link_base_p = 1; + if (closure->base) xfree (closure->base); if (closure->parent_base) @@ -545,13 +556,13 @@ collect_tags_mapper (struct taginfo *tag, void *arg) } /* Analyze HTML tags FILE and construct a list of URLs referenced from - it. It merges relative links in FILE with THIS_URL. It is aware - of and does the right thing. + it. It merges relative links in FILE with URL. It is aware of + and does the right thing. If dash_p_leaf_HTML is non-zero, only the elements needed to render FILE ("non-external" links) will be returned. */ struct urlpos * -get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML, +get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML, int *meta_disallow_follow) { struct file_memory *fm; @@ -569,7 +580,7 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML, closure.text = fm->content; closure.head = closure.tail = NULL; closure.base = NULL; - closure.parent_base = this_url ? this_url : opt.base_href; + closure.parent_base = url ? url : opt.base_href; closure.document_file = file; closure.dash_p_leaf_HTML = dash_p_leaf_HTML; closure.nofollow = 0; diff --git a/src/recur.c b/src/recur.c index 2c261579..7af7d050 100644 --- a/src/recur.c +++ b/src/recur.c @@ -280,6 +280,8 @@ retrieve_tree (const char *start_url) for (; child; child = child->next) { + if (child->ignore_when_downloading) + continue; if (descend_url_p (child, url_parsed, depth, start_url_parsed, blacklist)) { @@ -634,6 +636,15 @@ convert_all_links (void) char *local_name; struct url *u = cur_url->url; + if (cur_url->link_base_p) + { + /* Base references have been resolved by our parser, so + we turn the base URL into an empty string. (Perhaps + we should remove the tag entirely?) */ + cur_url->convert = CO_NULLIFY_BASE; + continue; + } + /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted ABS2REL, whereas non-downloaded will be converted REL2ABS. */ @@ -642,7 +653,7 @@ convert_all_links (void) DEBUGP (("%s marked for conversion, local %s\n", u->url, local_name)); - /* Decide on the conversion direction. */ + /* Decide on the conversion type. */ if (local_name) { /* We've downloaded this URL. Convert it to relative diff --git a/src/retr.c b/src/retr.c index d613da7c..07d37d8b 100644 --- a/src/retr.c +++ b/src/retr.c @@ -462,6 +462,9 @@ retrieve_from_file (const char *file, int html, int *count) char *filename = NULL, *new_file; int dt; + if (cur_url->ignore_when_downloading) + continue; + if (downloaded_exceeds_quota ()) { status = QUOTEXC; diff --git a/src/url.c b/src/url.c index 26642e59..1ce7222a 100644 --- a/src/url.c +++ b/src/url.c @@ -1698,7 +1698,7 @@ no_proxy_match (const char *host, const char **no_proxy) } static void write_backup_file PARAMS ((const char *, downloaded_file_t)); -static void replace_attr PARAMS ((const char **, int, FILE *, const char *)); +static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *)); static char *local_quote_string PARAMS ((const char *)); /* Change the links in one HTML file. LINKS is a list of links in the @@ -1765,6 +1765,7 @@ convert_links (const char *file, struct urlpos *links) read_file_free (fm); return; } + /* Here we loop through all the URLs in file, replacing those of them that are downloaded with relative references. */ p = fm->content; @@ -1788,35 +1789,50 @@ convert_links (const char *file, struct urlpos *links) quote, to the outfile. */ fwrite (p, 1, url_start - p, fp); p = url_start; - if (link->convert == CO_CONVERT_TO_RELATIVE) + + switch (link->convert) { + case CO_CONVERT_TO_RELATIVE: /* Convert absolute URL to relative. */ - char *newname = construct_relative (file, link->local_name); - char *quoted_newname = local_quote_string (newname); - replace_attr (&p, link->size, fp, quoted_newname); - DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", - link->url->url, newname, link->pos, file)); - xfree (newname); - xfree (quoted_newname); - ++to_file_count; - } - else if (link->convert == CO_CONVERT_TO_COMPLETE) - { + { + char *newname = construct_relative (file, link->local_name); + char *quoted_newname = local_quote_string (newname); + p = replace_attr (p, link->size, fp, quoted_newname); + DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", + link->url->url, newname, link->pos, file)); + xfree (newname); + xfree (quoted_newname); + ++to_file_count; + break; + } + case CO_CONVERT_TO_COMPLETE: /* Convert the link to absolute URL. */ - char *newlink = link->url->url; - char *quoted_newlink = html_quote_string (newlink); - replace_attr (&p, link->size, fp, quoted_newlink); - DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", - newlink, link->pos, file)); - xfree (quoted_newlink); - ++to_url_count; + { + char *newlink = link->url->url; + char *quoted_newlink = html_quote_string (newlink); + p = replace_attr (p, link->size, fp, quoted_newlink); + DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", + newlink, link->pos, file)); + xfree (quoted_newlink); + ++to_url_count; + break; + } + case CO_NULLIFY_BASE: + /* Change the base href to "". */ + p = replace_attr (p, link->size, fp, ""); + break; + case CO_NOCONVERT: + abort (); + break; } } + /* Output the rest of the file. */ if (p - fm->content < fm->length) fwrite (p, 1, fm->length - (p - fm->content), fp); fclose (fp); read_file_free (fm); + logprintf (LOG_VERBOSE, _("%d-%d\n"), to_file_count, to_url_count); } @@ -1960,13 +1976,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return) static int find_fragment PARAMS ((const char *, int, const char **, const char **)); -static void -replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str) +/* Replace an attribute's original text with NEW_TEXT. */ + +static const char * +replace_attr (const char *p, int size, FILE *fp, const char *new_text) { - const char *p = *pp; int quote_flag = 0; - int size = raw_size; - char quote_char = '\"'; + char quote_char = '\"'; /* use "..." for quoting, unless the + original value is quoted, in which + case reuse its quoting char. */ const char *frag_beg, *frag_end; /* Structure of our string is: @@ -1984,7 +2002,7 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str) size -= 2; /* disregard opening and closing quote */ } putc (quote_char, fp); - fputs (new_str, fp); + fputs (new_text, fp); /* Look for fragment identifier, if any. */ if (find_fragment (p, size, &frag_beg, &frag_end)) @@ -1993,7 +2011,8 @@ replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str) if (quote_flag) ++p; putc (quote_char, fp); - *pp = p; + + return p; } /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not diff --git a/src/url.h b/src/url.h index aed9bc38..836cdad1 100644 --- a/src/url.h +++ b/src/url.h @@ -65,8 +65,9 @@ enum convert_options { CO_NOCONVERT = 0, /* don't convert this URL */ CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to "../../otherdir/foo.gif" */ - CO_CONVERT_TO_COMPLETE /* convert to absolute, e.g. to + CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to "http://orighost/somedir/bar.jpg". */ + CO_NULLIFY_BASE /* change to empty string. */ }; /* A structure that defines the whereabouts of a URL, i.e. its @@ -78,10 +79,16 @@ struct urlpos { char *local_name; /* local file to which it was saved (used by convert_links) */ + int ignore_when_downloading; /* reserved for special links such as + which are used + when converting links, but ignored + when downloading. */ + /* Information about the original link: */ int link_relative_p; /* was the link relative? */ int link_complete_p; /* was the link complete (with the host name, etc.) */ + int link_base_p; /* was the link */ /* Conversion requirements: */ enum convert_options convert; /* is conversion required? */