diff --git a/src/main.c b/src/main.c
index 79c35220..8d8d93fa 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1196,7 +1196,7 @@ WARNING: Can't reopen standard output in binary mode;\n\
           if (url_scheme (*t) == SCHEME_FTP)
             opt.follow_ftp = 1;
 
-          status = retrieve_tree (*t);
+          status = retrieve_tree (*t, NULL);
 
           opt.follow_ftp = old_follow_ftp;
         }
diff --git a/src/recur.c b/src/recur.c
index 71fbe7bf..921c60c7 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -187,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int,
           options, add it to the queue. */
 
 uerr_t
-retrieve_tree (const char *start_url)
+retrieve_tree (const char *start_url, struct iri *pi)
 {
   uerr_t status = RETROK;
 
@@ -201,7 +201,18 @@ retrieve_tree (const char *start_url)
   int up_error_code;
   struct url *start_url_parsed;
   struct iri *i = iri_new ();
-  set_uri_encoding (i, opt.locale, true);
+
+#define COPYSTR(x) (x) ? xstrdup(x) : NULL;
+  /* Duplicate pi struct if not NULL */
+  if (pi)
+    {
+      i->uri_encoding = COPYSTR (pi->uri_encoding);
+      i->content_encoding = COPYSTR (pi->content_encoding);
+      i->utf8_encode = pi->utf8_encode;
+    }
+  else
+    set_uri_encoding (i, opt.locale, true);
+#undef COPYSTR
 
   start_url_parsed = url_parse (start_url, &up_error_code, i);
   if (!start_url_parsed)
diff --git a/src/recur.h b/src/recur.h
index 5ab26a95..515a382b 100644
--- a/src/recur.h
+++ b/src/recur.h
@@ -42,6 +42,6 @@ as that of the covered work. */
 struct urlpos;
 
 void recursive_cleanup (void);
-uerr_t retrieve_tree (const char *);
+uerr_t retrieve_tree (const char *, struct iri *);
 
 #endif /* RECUR_H */
diff --git a/src/retr.c b/src/retr.c
index 4731d9ee..963d5044 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -651,7 +651,6 @@ retrieve_url (const char *origurl, char **file, char **newloc,
       proxy = getproxy (u);
       if (proxy)
         {
-          /* sXXXav : could a proxy include a path ??? */
           struct iri *pi = iri_new ();
           set_uri_encoding (pi, opt.locale, true);
           pi->utf8_encode = false;
@@ -858,6 +857,7 @@ retrieve_from_file (const char *file, bool html, int *count)
   *count = 0;                  /* Reset the URL count. */
 
   /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
   set_content_encoding (iri, opt.locale);
 
   if (url_has_scheme (url))
@@ -894,6 +894,10 @@ retrieve_from_file (const char *file, bool html, int *count)
           status = QUOTEXC;
           break;
         }
+
+      /* Reset UTF-8 encode status */
+      iri->utf8_encode = opt.enable_iri;
+
       if ((opt.recursive || opt.page_requisites)
           && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
         {
@@ -903,7 +907,7 @@ retrieve_from_file (const char *file, bool html, int *count)
          if (cur_url->url->scheme == SCHEME_FTP)
            opt.follow_ftp = 1;
 
-          status = retrieve_tree (cur_url->url->url);
+          status = retrieve_tree (cur_url->url->url, iri);
 
          opt.follow_ftp = old_follow_ftp;
        }
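Note on the src changes above: retrieve_tree() now takes the struct iri of the document that supplied the start URL. retrieve_from_file() passes its iri through (after resetting utf8_encode for each URL), while main.c passes NULL and keeps the old locale-based default; inside retrieve_tree() the temporary COPYSTR macro duplicates the caller's fields into the freshly allocated iri. The sketch below restates that copy as a standalone helper, purely for illustration: iri_dup is a hypothetical name, the struct shows only the fields the hunk touches (the real struct iri lives in src/iri.h and may carry more members), and plain calloc/strdup stand in for wget's allocation wrappers.

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Reduced struct: only the fields copied in the recur.c hunk above. */
struct iri {
  char *uri_encoding;      /* charset used when (re-)encoding the URI     */
  char *content_encoding;  /* charset of the document the URI came from   */
  bool utf8_encode;        /* percent-encode the URI as UTF-8?            */
};

/* Hypothetical helper: seed a fresh iri from SRC.  When SRC is NULL the
   caller would still apply the locale default, as retrieve_tree() does
   with set_uri_encoding() in its else branch. */
struct iri *
iri_dup (const struct iri *src)
{
  struct iri *dst = calloc (1, sizeof *dst);
  if (!dst)
    return NULL;
  if (src)
    {
      dst->uri_encoding = src->uri_encoding ? strdup (src->uri_encoding) : NULL;
      dst->content_encoding =
        src->content_encoding ? strdup (src->content_encoding) : NULL;
      dst->utf8_encode = src->utf8_encode;
    }
  return dst;
}

Factoring the copy into one place like this would also avoid the stray second semicolon that COPYSTR's trailing ';' expands to at each assignment.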
diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px
new file mode 100755
index 00000000..51bb09fe
--- /dev/null
+++ b/tests/Test-iri-list.px
@@ -0,0 +1,173 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use HTTPTest;
+
+# cf. http://en.wikipedia.org/wiki/Latin1
+#     http://en.wikipedia.org/wiki/ISO-8859-15
+###############################################################################
+#
+# mime : charset found in Content-Type HTTP MIME header
+# meta : charset found in Content-Type meta tag
+#
+# index.html        mime + file = iso-8859-15
+# p1_français.html  meta + file = iso-8859-1, mime = utf-8
+# p2_één.html       meta + file = utf-8, mime = iso-8859-1
+#
+
+my $ccedilla_l1 = "\xE7";
+my $ccedilla_u8 = "\xC3\xA7";
+my $eacute_l1 = "\xE9";
+my $eacute_u8 = "\xC3\xA9";
+
+my $urllist = <<EOF;
+http://localhost:{{port}}/
+http://localhost:{{port}}/p1_fran${ccedilla_l1}ais.html
+http://localhost:{{port}}/p2_${eacute_l1}${eacute_l1}n.html
+EOF
+
+my $pageindex = <<EOF;
+<html>
+<head>
+  <title>Main Page</title>
+</head>
+<body>
+  <p>
+    Main page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pagefrancais = <<EOF;
+<html>
+<head>
+  <title>La seule page en français</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"/>
+</head>
+<body>
+  <p>
+    French page.
+  </p>
+</body>
+</html>
+EOF
+
+my $pageeen = <<EOF;
+<html>
+<head>
+  <title>Die enkele nederlandstalige pagina</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
+</head>
+<body>
+  <p>
+    Dutch page.
+  </p>
+</body>
+</html>
+EOF
+
+my $page404 = <<EOF;
+<html>
+<head>
+  <title>404</title>
+</head>
+<body>
+  <p>
+    Nop nop nop...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    '/index.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-15",
+        },
+        content => $pageindex,
+    },
+    '/robots.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain",
+        },
+        content => "",
+    },
+    '/p1_fran%C3%A7ais.html' => {      # UTF-8 encoded
+        code => "404",
+        msg => "File not found",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $page404,
+    },
+    '/p1_fran%E7ais.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=UTF-8",
+        },
+        content => $pagefrancais,
+    },
+    '/p2_%C3%A9%C3%A9n.html' => {      # UTF-8 encoded
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/p2_%E9%E9n.html' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/html; charset=ISO-8859-1",
+        },
+        content => $pageeen,
+    },
+    '/url_list.txt' => {
+        code => "200",
+        msg => "Ok",
+        headers => {
+            "Content-type" => "text/plain; charset=ISO-8859-1",
+        },
+        content => $urllist,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --iri -d -i http://localhost:{{port}}/url_list.txt";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    'url_list.txt' => {
+        content => $urllist,
+    },
+    'index.html' => {
+        content => $pageindex,
+    },
+    "p1_fran${ccedilla_l1}ais.html" => {
+        content => $pagefrancais,
+    },
+    "p2_${eacute_u8}${eacute_u8}n.html" => {
+        content => $pageeen,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-iri-list",
+                              input => \%urls,
+                              cmdline => $cmdline,
+                              errcode => $expected_error_code,
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
diff --git a/tests/run-px b/tests/run-px
index 172adcd7..51dec828 100755
--- a/tests/run-px
+++ b/tests/run-px
@@ -25,6 +25,7 @@ my @tests = (
     'Test-iri.px',
     'Test-iri-disabled.px',
     'Test-iri-forced-remote.px',
+    'Test-iri-list.px',
     'Test-N-current.px',
     'Test-N-smaller.px',
     'Test-N-no-info.px',
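A note on the new test: url_list.txt is served as ISO-8859-1 and lists the non-ASCII paths in raw latin-1 bytes. The server map therefore carries both escapings of each name, %E7 / %E9%E9 and %C3%A7 / %C3%A9%C3%A9, with the UTF-8 spelling of p1 deliberately answering 404; the expected output keeps the latin-1 name for p1, while p2, whose UTF-8 spelling succeeds directly, is saved under the UTF-8 name. The standalone snippet below (illustration only, reusing the byte values declared at the top of the test) shows where the two escapings come from:

#include <stdio.h>

/* Percent-encode a byte string the way it would appear in a request path. */
static void
print_percent_encoded (const char *label, const char *bytes)
{
  printf ("%-10s ", label);
  for (const unsigned char *p = (const unsigned char *) bytes; *p; p++)
    printf ("%%%02X", (unsigned int) *p);
  putchar ('\n');
}

int
main (void)
{
  /* Byte values straight from the test: $ccedilla_l1 and $ccedilla_u8. */
  print_percent_encoded ("ISO-8859-1", "\xE7");     /* -> %E7    */
  print_percent_encoded ("UTF-8",      "\xC3\xA7"); /* -> %C3%A7 */
  return 0;
}

Compiled and run, it prints %E7 and %C3%A7, the two p1 keys that appear in %urls.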