mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Added --convert-file-only option
* src/convert.c (convert_links_in_hashtable, convert_links): test for CO_CONVERT_BASENAME_ONLY. (convert_basename): new function. * src/convert.h: new constant CO_CONVERT_BASENAME_ONLY. * src/init.c, src/main.c, src/options.h: new option "--convert-file-only". * doc/wget.texi: updated documentation. Reviewed-by: Gabriel Somlo <somlo@cmu.edu>
This commit is contained in:
parent
3608c3001c
commit
4ad201a7e7
@ -2123,6 +2123,23 @@ Note that only at the end of the download can Wget know which links have
|
|||||||
been downloaded. Because of that, the work done by @samp{-k} will be
|
been downloaded. Because of that, the work done by @samp{-k} will be
|
||||||
performed at the end of all the downloads.
|
performed at the end of all the downloads.
|
||||||
|
|
||||||
|
@item --convert-file-only
|
||||||
|
This option converts only the filename part of the URLs, leaving the rest
|
||||||
|
of the URLs untouched. This filename part is sometimes referred to as the
|
||||||
|
"basename", although we avoid that term here in order not to cause confusion.
|
||||||
|
|
||||||
|
It works particularly well in conjunction with @samp{--adjust-extension}, although
|
||||||
|
this coupling is not enforced. It proves useful to populate Internet caches
|
||||||
|
with files downloaded from different hosts.
|
||||||
|
|
||||||
|
Example: if some link points to @file{//foo.com/bar.cgi?xyz} with
|
||||||
|
@samp{--adjust-extension} asserted and its local destination is intended to be
|
||||||
|
@file{./foo.com/bar.cgi?xyz.css}, then the link would be converted to
|
||||||
|
@file{//foo.com/bar.cgi?xyz.css}. Note that only the filename part has been
|
||||||
|
modified. The rest of the URL has been left untouched, including the net path
|
||||||
|
(@code{//}) which would otherwise be processed by Wget and converted to the
|
||||||
|
effective scheme (i.e., @code{http://}).
|
||||||
|
|
||||||
@cindex backing up converted files
|
@cindex backing up converted files
|
||||||
@item -K
|
@item -K
|
||||||
@itemx --backup-converted
|
@itemx --backup-converted
|
||||||
|
@ -46,6 +46,7 @@ as that of the covered work. */
|
|||||||
#include "html-url.h"
|
#include "html-url.h"
|
||||||
#include "css-url.h"
|
#include "css-url.h"
|
||||||
#include "iri.h"
|
#include "iri.h"
|
||||||
|
#include "xstrndup.h"
|
||||||
|
|
||||||
static struct hash_table *dl_file_url_map;
|
static struct hash_table *dl_file_url_map;
|
||||||
struct hash_table *dl_url_file_map;
|
struct hash_table *dl_url_file_map;
|
||||||
@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
|
|||||||
form. We do this even if the URL already is in
|
form. We do this even if the URL already is in
|
||||||
relative form, because our directory structure may
|
relative form, because our directory structure may
|
||||||
not be identical to that on the server (think `-nd',
|
not be identical to that on the server (think `-nd',
|
||||||
`--cut-dirs', etc.) */
|
`--cut-dirs', etc.). If --convert-file-only was passed,
|
||||||
cur_url->convert = CO_CONVERT_TO_RELATIVE;
|
we only convert the basename portion of the URL. */
|
||||||
|
cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
|
||||||
cur_url->local_name = xstrdup (local_name);
|
cur_url->local_name = xstrdup (local_name);
|
||||||
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
|
||||||
}
|
}
|
||||||
@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *,
|
|||||||
const char *, int);
|
const char *, int);
|
||||||
static char *local_quote_string (const char *, bool);
|
static char *local_quote_string (const char *, bool);
|
||||||
static char *construct_relative (const char *, const char *);
|
static char *construct_relative (const char *, const char *);
|
||||||
|
static char *convert_basename (const char *, const struct urlpos *);
|
||||||
|
|
||||||
/* Change the links in one file. LINKS is a list of links in the
|
/* Change the links in one file. LINKS is a list of links in the
|
||||||
document, along with their positions and the desired direction of
|
document, along with their positions and the desired direction of
|
||||||
@ -315,9 +318,32 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
|
|
||||||
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
|
||||||
link->url->url, newname, link->pos, file));
|
link->url->url, newname, link->pos, file));
|
||||||
|
|
||||||
xfree (newname);
|
xfree (newname);
|
||||||
xfree (quoted_newname);
|
xfree (quoted_newname);
|
||||||
++to_file_count;
|
++to_file_count;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case CO_CONVERT_BASENAME_ONLY:
|
||||||
|
{
|
||||||
|
char *newname = convert_basename (p, link);
|
||||||
|
char *quoted_newname = local_quote_string (newname, link->link_css_p);
|
||||||
|
|
||||||
|
if (link->link_css_p)
|
||||||
|
p = replace_plain (p, link->size, fp, quoted_newname);
|
||||||
|
else if (!link->link_refresh_p)
|
||||||
|
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||||
|
else
|
||||||
|
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
|
||||||
|
link->refresh_timeout);
|
||||||
|
|
||||||
|
DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
|
||||||
|
link->url->url, newname, link->pos, file));
|
||||||
|
|
||||||
|
xfree (newname);
|
||||||
|
xfree (quoted_newname);
|
||||||
|
++to_file_count;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CO_CONVERT_TO_COMPLETE:
|
case CO_CONVERT_TO_COMPLETE:
|
||||||
@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
|
|
||||||
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
|
||||||
newlink, link->pos, file));
|
newlink, link->pos, file));
|
||||||
|
|
||||||
xfree (quoted_newlink);
|
xfree (quoted_newlink);
|
||||||
++to_url_count;
|
++to_url_count;
|
||||||
break;
|
break;
|
||||||
@ -422,6 +449,71 @@ construct_relative (const char *basefile, const char *linkfile)
|
|||||||
return link;
|
return link;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Construct and return a "transparent proxy" URL
|
||||||
|
reflecting changes made by --adjust-extension to the file component
|
||||||
|
(i.e., "basename") of the original URL, but leaving the "dirname"
|
||||||
|
of the URL (protocol://hostname... portion) untouched.
|
||||||
|
|
||||||
|
Think: populating a squid cache via a recursive wget scrape, where
|
||||||
|
changing URLs to work locally with "file://..." is NOT desirable.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
if
|
||||||
|
p = "//foo.com/bar.cgi?xyz"
|
||||||
|
and
|
||||||
|
link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
|
||||||
|
then
|
||||||
|
|
||||||
|
new_construct_func(p, link);
|
||||||
|
will return
|
||||||
|
"//foo.com/bar.cgi?xyz.css"
|
||||||
|
|
||||||
|
Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
|
||||||
|
*/
|
||||||
|
static char *
|
||||||
|
convert_basename (const char *p, const struct urlpos *link)
|
||||||
|
{
|
||||||
|
int len = link->size;
|
||||||
|
char *url = NULL;
|
||||||
|
char *org_basename = NULL, *local_basename = NULL;
|
||||||
|
char *result = NULL;
|
||||||
|
|
||||||
|
if (*p == '"' || *p == '\'')
|
||||||
|
{
|
||||||
|
len -= 2;
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
|
||||||
|
url = xstrndup (p, len);
|
||||||
|
|
||||||
|
org_basename = strrchr (url, '/');
|
||||||
|
if (org_basename)
|
||||||
|
org_basename++;
|
||||||
|
else
|
||||||
|
org_basename = url;
|
||||||
|
|
||||||
|
local_basename = strrchr (link->local_name, '/');
|
||||||
|
if (local_basename)
|
||||||
|
local_basename++;
|
||||||
|
else
|
||||||
|
local_basename = url;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the basenames differ, graft the adjusted basename (local_basename)
|
||||||
|
* onto the original URL.
|
||||||
|
*/
|
||||||
|
if (strcmp (org_basename, local_basename) == 0)
|
||||||
|
result = url;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
result = uri_merge (url, local_basename);
|
||||||
|
xfree (url);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/* Used by write_backup_file to remember which files have been
|
/* Used by write_backup_file to remember which files have been
|
||||||
written. */
|
written. */
|
||||||
static struct hash_table *converted_files;
|
static struct hash_table *converted_files;
|
||||||
|
@ -40,6 +40,8 @@ enum convert_options {
|
|||||||
CO_NOCONVERT = 0, /* don't convert this URL */
|
CO_NOCONVERT = 0, /* don't convert this URL */
|
||||||
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
|
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
|
||||||
"../../otherdir/foo.gif" */
|
"../../otherdir/foo.gif" */
|
||||||
|
CO_CONVERT_BASENAME_ONLY, /* convert the file portion only (basename)
|
||||||
|
leaving the rest of the URL unchanged */
|
||||||
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
|
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
|
||||||
"http://orighost/somedir/bar.jpg". */
|
"http://orighost/somedir/bar.jpg". */
|
||||||
CO_NULLIFY_BASE /* change to empty string. */
|
CO_NULLIFY_BASE /* change to empty string. */
|
||||||
|
@ -159,6 +159,7 @@ static const struct {
|
|||||||
{ "contentdisposition", &opt.content_disposition, cmd_boolean },
|
{ "contentdisposition", &opt.content_disposition, cmd_boolean },
|
||||||
{ "contentonerror", &opt.content_on_error, cmd_boolean },
|
{ "contentonerror", &opt.content_on_error, cmd_boolean },
|
||||||
{ "continue", &opt.always_rest, cmd_boolean },
|
{ "continue", &opt.always_rest, cmd_boolean },
|
||||||
|
{ "convertfileonly", &opt.convert_file_only, cmd_boolean },
|
||||||
{ "convertlinks", &opt.convert_links, cmd_boolean },
|
{ "convertlinks", &opt.convert_links, cmd_boolean },
|
||||||
{ "cookies", &opt.cookies, cmd_boolean },
|
{ "cookies", &opt.cookies, cmd_boolean },
|
||||||
#ifdef HAVE_SSL
|
#ifdef HAVE_SSL
|
||||||
@ -377,6 +378,7 @@ defaults (void)
|
|||||||
opt.htmlify = true;
|
opt.htmlify = true;
|
||||||
opt.http_keep_alive = true;
|
opt.http_keep_alive = true;
|
||||||
opt.use_proxy = true;
|
opt.use_proxy = true;
|
||||||
|
opt.convert_file_only = false;
|
||||||
tmp = getenv ("no_proxy");
|
tmp = getenv ("no_proxy");
|
||||||
if (tmp)
|
if (tmp)
|
||||||
opt.no_proxy = sepstring (tmp);
|
opt.no_proxy = sepstring (tmp);
|
||||||
|
24
src/main.c
24
src/main.c
@ -264,6 +264,7 @@ static struct cmdline_option option_data[] =
|
|||||||
{ "config", 0, OPT_VALUE, "chooseconfig", -1 },
|
{ "config", 0, OPT_VALUE, "chooseconfig", -1 },
|
||||||
{ "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
|
{ "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
|
||||||
{ "continue", 'c', OPT_BOOLEAN, "continue", -1 },
|
{ "continue", 'c', OPT_BOOLEAN, "continue", -1 },
|
||||||
|
{ "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 },
|
||||||
{ "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
|
{ "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
|
||||||
{ "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
|
{ "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
|
||||||
{ "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
|
{ "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
|
||||||
@ -876,6 +877,8 @@ Recursive download:\n"),
|
|||||||
N_("\
|
N_("\
|
||||||
-k, --convert-links make links in downloaded HTML or CSS point to\n\
|
-k, --convert-links make links in downloaded HTML or CSS point to\n\
|
||||||
local files\n"),
|
local files\n"),
|
||||||
|
N_("\
|
||||||
|
--convert-file-only convert the file part of the URLs only (usually known as the basename)\n"),
|
||||||
N_("\
|
N_("\
|
||||||
--backups=N before writing file X, rotate up to N backup files\n"),
|
--backups=N before writing file X, rotate up to N backup files\n"),
|
||||||
|
|
||||||
@ -1387,11 +1390,14 @@ main (int argc, char **argv)
|
|||||||
/* All user options have now been processed, so it's now safe to do
|
/* All user options have now been processed, so it's now safe to do
|
||||||
interoption dependency checks. */
|
interoption dependency checks. */
|
||||||
|
|
||||||
if (opt.noclobber && opt.convert_links)
|
if (opt.noclobber && (opt.convert_links || opt.convert_file_only))
|
||||||
{
|
{
|
||||||
fprintf (stderr,
|
fprintf (stderr,
|
||||||
_("Both --no-clobber and --convert-links were specified,"
|
opt.convert_links ?
|
||||||
" only --convert-links will be used.\n"));
|
_("Both --no-clobber and --convert-links were specified,"
|
||||||
|
" only --convert-links will be used.\n") :
|
||||||
|
_("Both --no-clobber and --convert-file-only were specified,"
|
||||||
|
" only --convert-file-only will be used.\n"));
|
||||||
opt.noclobber = false;
|
opt.noclobber = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n"));
|
|||||||
#endif
|
#endif
|
||||||
if (opt.output_document)
|
if (opt.output_document)
|
||||||
{
|
{
|
||||||
if (opt.convert_links
|
if ((opt.convert_links || opt.convert_file_only)
|
||||||
&& (nurl > 1 || opt.page_requisites || opt.recursive))
|
&& (nurl > 1 || opt.page_requisites || opt.recursive))
|
||||||
{
|
{
|
||||||
fputs (_("\
|
fputs (_("\
|
||||||
Cannot specify both -k and -O if multiple URLs are given, or in combination\n\
|
Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\
|
||||||
with -p or -r. See the manual for details.\n\n"), stderr);
|
with -p or -r. See the manual for details.\n\n"), stderr);
|
||||||
print_usage (1);
|
print_usage (1);
|
||||||
exit (WGET_EXIT_GENERIC_ERROR);
|
exit (WGET_EXIT_GENERIC_ERROR);
|
||||||
@ -1760,6 +1766,12 @@ for details.\n\n"));
|
|||||||
outputting to a regular file.\n"));
|
outputting to a regular file.\n"));
|
||||||
exit (WGET_EXIT_GENERIC_ERROR);
|
exit (WGET_EXIT_GENERIC_ERROR);
|
||||||
}
|
}
|
||||||
|
if (!output_stream_regular && (opt.convert_links || opt.convert_file_only))
|
||||||
|
{
|
||||||
|
fprintf (stderr, _("--convert-links or --convert-file-only can be used together \
|
||||||
|
only if outputting to a regular file.\n"));
|
||||||
|
exit (WGET_EXIT_GENERIC_ERROR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __VMS
|
#ifdef __VMS
|
||||||
@ -1970,7 +1982,7 @@ outputting to a regular file.\n"));
|
|||||||
save_hsts ();
|
save_hsts ();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (opt.convert_links && !opt.delete_after)
|
if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after)
|
||||||
convert_all_links ();
|
convert_all_links ();
|
||||||
|
|
||||||
cleanup ();
|
cleanup ();
|
||||||
|
@ -182,6 +182,9 @@ struct options
|
|||||||
NULL. */
|
NULL. */
|
||||||
bool convert_links; /* Will the links be converted
|
bool convert_links; /* Will the links be converted
|
||||||
locally? */
|
locally? */
|
||||||
|
bool convert_file_only; /* Convert only the file portion of the URI (i.e. basename).
|
||||||
|
Leave everything else untouched. */
|
||||||
|
|
||||||
bool remove_listing; /* Do we remove .listing files
|
bool remove_listing; /* Do we remove .listing files
|
||||||
generated by FTP? */
|
generated by FTP? */
|
||||||
bool htmlify; /* Do we HTML-ify the OS-dependent
|
bool htmlify; /* Do we HTML-ify the OS-dependent
|
||||||
|
Loading…
Reference in New Issue
Block a user