From e6b4e761d1f1439b1b2352f5eeaedd1ae5b9d76e Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 17:42:16 +0200 Subject: [PATCH 1/3] Don't forget to free the iri struct --- src/retr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/retr.c b/src/retr.c index fe176eaf..4731d9ee 100644 --- a/src/retr.c +++ b/src/retr.c @@ -928,6 +928,8 @@ Removing file due to --delete-after in retrieve_from_file():\n")); /* Free the linked list of URL-s. */ free_urlpos (url_list); + iri_free (iri); + return status; } From 723dbfc818e3e5b22ec53fd093dca999290ebead Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 18:26:53 +0200 Subject: [PATCH 2/3] Correct iri handling while fetching a remote file list with -i and provide a test --- src/main.c | 2 +- src/recur.c | 15 +++- src/recur.h | 2 +- src/retr.c | 8 +- tests/Test-iri-list.px | 173 +++++++++++++++++++++++++++++++++++++++++ tests/run-px | 1 + 6 files changed, 195 insertions(+), 6 deletions(-) create mode 100755 tests/Test-iri-list.px diff --git a/src/main.c b/src/main.c index 79c35220..8d8d93fa 100644 --- a/src/main.c +++ b/src/main.c @@ -1196,7 +1196,7 @@ WARNING: Can't reopen standard output in binary mode;\n\ if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; - status = retrieve_tree (*t); + status = retrieve_tree (*t, NULL); opt.follow_ftp = old_follow_ftp; } diff --git a/src/recur.c b/src/recur.c index 71fbe7bf..921c60c7 100644 --- a/src/recur.c +++ b/src/recur.c @@ -187,7 +187,7 @@ static bool descend_redirect_p (const char *, const char *, int, options, add it to the queue. */ uerr_t -retrieve_tree (const char *start_url) +retrieve_tree (const char *start_url, struct iri *pi) { uerr_t status = RETROK; @@ -201,7 +201,18 @@ retrieve_tree (const char *start_url) int up_error_code; struct url *start_url_parsed; struct iri *i = iri_new (); - set_uri_encoding (i, opt.locale, true); + +#define COPYSTR(x) (x) ? 
xstrdup(x) : NULL; + /* Duplicate pi struct if not NULL */ + if (pi) + { + i->uri_encoding = COPYSTR (pi->uri_encoding); + i->content_encoding = COPYSTR (pi->content_encoding); + i->utf8_encode = pi->utf8_encode; + } + else + set_uri_encoding (i, opt.locale, true); +#undef COPYSTR start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) diff --git a/src/recur.h b/src/recur.h index 5ab26a95..515a382b 100644 --- a/src/recur.h +++ b/src/recur.h @@ -42,6 +42,6 @@ as that of the covered work. */ struct urlpos; void recursive_cleanup (void); -uerr_t retrieve_tree (const char *); +uerr_t retrieve_tree (const char *, struct iri *); #endif /* RECUR_H */ diff --git a/src/retr.c b/src/retr.c index 4731d9ee..963d5044 100644 --- a/src/retr.c +++ b/src/retr.c @@ -651,7 +651,6 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { - /* sXXXav : could a proxy include a path ??? */ struct iri *pi = iri_new (); set_uri_encoding (pi, opt.locale, true); pi->utf8_encode = false; @@ -858,6 +857,7 @@ retrieve_from_file (const char *file, bool html, int *count) *count = 0; /* Reset the URL count. 
*/ /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); set_content_encoding (iri, opt.locale); if (url_has_scheme (url)) @@ -894,6 +894,10 @@ retrieve_from_file (const char *file, bool html, int *count) status = QUOTEXC; break; } + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { @@ -903,7 +907,7 @@ retrieve_from_file (const char *file, bool html, int *count) if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - status = retrieve_tree (cur_url->url->url); + status = retrieve_tree (cur_url->url->url, iri); opt.follow_ftp = old_follow_ftp; } diff --git a/tests/Test-iri-list.px b/tests/Test-iri-list.px new file mode 100755 index 00000000..51bb09fe --- /dev/null +++ b/tests/Test-iri-list.px @@ -0,0 +1,173 @@ +#!/usr/bin/perl -w + +use strict; + +use HTTPTest; + +# cf. http://en.wikipedia.org/wiki/Latin1 +# http://en.wikipedia.org/wiki/ISO-8859-15 +############################################################################### +# +# mime : charset found in Content-Type HTTP MIME header +# meta : charset found in Content-Type meta tag +# +# index.html mime + file = iso-8859-15 +# p1_français.html meta + file = iso-8859-1, mime = utf-8 +# p2_één.html meta + file = utf-8, mime =iso-8859-1 +# + +my $ccedilla_l1 = "\xE7"; +my $ccedilla_u8 = "\xC3\xA7"; +my $eacute_l1 = "\xE9"; +my $eacute_u8 = "\xC3\xA9"; + +my $urllist = < + + Main Page + + +

+ Main page. +

+ + +EOF + +my $pagefrancais = < + + La seule page en français + + + +

+ French page. +

+ + +EOF + +my $pageeen = < + + Die enkele nederlandstalige pagina + + + +

+ Dutch page. +

+ + +EOF + +my $page404 = < + + 404 + + +

+ Nop nop nop... +

+ + +EOF + +# code, msg, headers, content +my %urls = ( + '/index.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-15", + }, + content => $pageindex, + }, + '/robots.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain", + }, + content => "", + }, + '/p1_fran%C3%A7ais.html' => { # UTF-8 encoded + code => "404", + msg => "File not found", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $page404, + }, + '/p1_fran%E7ais.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=UTF-8", + }, + content => $pagefrancais, + }, + '/p2_%C3%A9%C3%A9n.html' => { # UTF-8 encoded + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/p2_%E9%E9n.html' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/html; charset=ISO-8859-1", + }, + content => $pageeen, + }, + '/url_list.txt' => { + code => "200", + msg => "Ok", + headers => { + "Content-type" => "text/plain; charset=ISO-8859-1", + }, + content => $urllist, + }, +); + +my $cmdline = $WgetTest::WGETPATH . 
" --iri -d -i http://localhost:{{port}}/url_list.txt"; + +my $expected_error_code = 0; + +my %expected_downloaded_files = ( + 'url_list.txt' => { + content => $urllist, + }, + 'index.html' => { + content => $pageindex, + }, + "p1_fran${ccedilla_l1}ais.html" => { + content => $pagefrancais, + }, + "p2_${eacute_u8}${eacute_u8}n.html" => { + content => $pageeen, + }, +); + +############################################################################### + +my $the_test = HTTPTest->new (name => "Test-iri-list", + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, + output => \%expected_downloaded_files); +exit $the_test->run(); + +# vim: et ts=4 sw=4 + diff --git a/tests/run-px b/tests/run-px index 172adcd7..51dec828 100755 --- a/tests/run-px +++ b/tests/run-px @@ -25,6 +25,7 @@ my @tests = ( 'Test-iri.px', 'Test-iri-disabled.px', 'Test-iri-forced-remote.px', + 'Test-iri-list.px', 'Test-N-current.px', 'Test-N-smaller.px', 'Test-N-no-info.px', From a5c222fa798673319e930e944d8d59cd906361fc Mon Sep 17 00:00:00 2001 From: Xavier Saint Date: Thu, 14 Aug 2008 18:31:03 +0200 Subject: [PATCH 3/3] Update tests/ChangeLog for Test-iri-list.px --- tests/ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ChangeLog b/tests/ChangeLog index ad18c14a..f2179763 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,7 @@ +2008-08-14 Xavier Saint + + * Test-iri-list.px : Fetch files from a remote list. + 2008-08-03 Xavier Saint * Test-iri.px : HTTP recursive fetch for testing IRI support and