Change global variable model for state-object

2024-07-03 16:38:41 -04:00 · 2008-07-24 00:56:29 +02:00 · 2008-07-24 00:56:29 +02:00 · d82f80ecab
commit d82f80ecab
parent c31e00b52d
16 changed files with 197 additions and 231 deletions
--- a/src/convert.c
+++ b/src/convert.c
@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,

      /* Parse the file...  */
      urls = is_css ? get_urls_css_file (file, url) :
-                      get_urls_html (file, url, NULL);
+                      get_urls_html (file, url, NULL, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
--- a/src/html-url.c
+++ b/src/html-url.c
@ -44,7 +44,6 @@ as that of the covered work.  */
 #include "recur.h"
 #include "html-url.h"
 #include "css-url.h"
-#include "iri.h"

 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

@ -175,6 +174,10 @@ static const char *additional_attributes[] = {
 static struct hash_table *interesting_tags;
 static struct hash_table *interesting_attributes;

+/* Will contains the (last) charset found in 'http-equiv=content-type'
+   meta tags  */
+static char *meta_charset;
+
 static void
 init_interesting (void)
 {
@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size,
          return NULL;
        }

-      set_ugly_no_encode (true);
-      url = url_parse (link_uri, NULL);
-      set_ugly_no_encode (false);
+      url = url_parse (link_uri, NULL, NULL);
      if (!url)
        {
          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size,
      DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
               ctx->document_file, base, link_uri, complete_uri));

-      set_ugly_no_encode (true);
-      url = url_parse (complete_uri, NULL);
-      set_ugly_no_encode (false);
+      url = url_parse (complete_uri, NULL, NULL);
      if (!url)
        {
          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
        return;

      /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
-
-      set_current_charset (mcharset);
-      xfree (mcharset);
+      xfree_null (meta_charset);
+      meta_charset = mcharset;
    }
  else if (name && 0 == strcasecmp (name, "robots"))
    {
@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
   <base href=...> and does the right thing.  */

 struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+               struct iri *iri)
 {
  struct file_memory *fm;
  struct map_context ctx;
@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                 NULL, interesting_attributes);

+  /* If meta charset isn't null, override content encoding */
+  if (iri && meta_charset)
+    set_content_encoding (iri, meta_charset);
+
  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = ctx.nofollow;
@ -750,9 +753,7 @@ get_urls_file (const char *file)
          url_text = merged;
        }

-      set_ugly_no_encode (true);
-      url = url_parse (url_text, &up_error_code);
-      set_ugly_no_encode (false);
+      url = url_parse (url_text, &up_error_code, NULL);
      if (!url)
        {
          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
--- a/src/html-url.h
+++ b/src/html-url.h
@ -44,7 +44,7 @@ struct map_context {
 };

 struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
 struct urlpos *append_url (const char *, int, int, struct map_context *);
 void free_urlpos (struct urlpos *);

--- a/src/http.c
+++ b/src/http.c
@ -49,7 +49,6 @@ as that of the covered work.  */
 #include "retr.h"
 #include "connect.h"
 #include "netrc.h"
-#include "iri.h"
 #ifdef HAVE_SSL
 # include "ssl.h"
 #endif
@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs)
   If PROXY is non-NULL, the connection will be made to the proxy
   server, and u->url will be requested.  */
 static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
+gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
+         struct iri *iri)
 {
  struct request *req;

@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));

          /* Try to get remote encoding if needed */
          if (opt.enable_iri && !opt.encoding_remote)
-            set_current_charset (parse_charset (tmp2));
+            {
+              tmp = parse_charset (tmp2);
+              if (tmp)
+                set_content_encoding (iri, tmp);
+            }
        }
    }
  hs->newloc = resp_header_strdup (resp, "Location");
@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file));
   retried, and retried, and retried, and...  */
 uerr_t
 http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
-           int *dt, struct url *proxy)
+           int *dt, struct url *proxy, struct iri *iri)
 {
  int count;
  bool got_head = false;         /* used for time-stamping and filename detection */
@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n"));
        *dt &= ~SEND_NOCACHE;

      /* Try fetching the document, or at least its head.  */
-      err = gethttp (u, &hstat, dt, proxy);
+      err = gethttp (u, &hstat, dt, proxy, iri);

      /* Time?  */
      tms = datetime_str (time (NULL));
@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n"));
            }
          /* Maybe we should always keep track of broken links, not just in
           * spider mode.
-           * Don't log error if it was utf8 encoded because we will try
-           * one unencoded. */
-          else if (opt.spider && !get_utf8_encode ())
+           * Don't log error if it was UTF-8 encoded because we will try
+           * once unencoded. */
+          else if (opt.spider && !iri->utf8_encode)
            {
              /* #### Again: ugly ugly ugly! */
              if (!hurl)
--- a/src/http.h
+++ b/src/http.h
@ -33,7 +33,7 @@ as that of the covered work.  */
 struct url;

 uerr_t http_loop (struct url *, char **, char **, const char *, int *,
-		  struct url *);
+		  struct url *, struct iri *);
 void save_cookies (void);
 void http_cleanup (void);
 time_t http_atotm (const char *);
--- a/src/iri.c
+++ b/src/iri.c
@ -46,18 +46,6 @@ as that of the covered work.  */

 /* Note: locale encoding is kept in options struct (opt.locale) */

-/* Hold the encoding used for the current fetch */
-char *remote;
-
-/* Hold the encoding for the future found links */
-char *current;
-
-/* Will/Is the current URL encoded in utf8 ? */
-bool utf8_encode;
-
-/* Force no utf8 encoding for url_parse () */
-bool ugly_no_encode;
-
 static iconv_t locale2utf8;

 static bool open_locale_to_utf8 (void);
@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
   on error. */
 char *
-idn_encode (char *host, bool utf8_encoded)
+idn_encode (struct iri *i, char *host)
 {
  char *new;
  int ret;

-  /* Encode to UTF-8 if not done using current remote */
-  if (!utf8_encoded)
+  /* Encode to UTF-8 if not done */
+  if (!i->utf8_encode)
    {
-      if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+      if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
        {
          /* Nothing to encode or an error occured */
          return NULL;
@ -291,7 +279,7 @@ idn_decode (char *host)
 /* Try to transcode string str from remote encoding to UTF-8. On success, *new
   contains the transcoded string. *new content is unspecified otherwise. */
 bool
-remote_to_utf8 (const char *str, const char **new)
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
 {
  char *r;
  iconv_t cd;
@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new)

  if (opt.encoding_remote)
    r = opt.encoding_remote;
-  else if (current)
-    r = current;
+  else if (i->uri_encoding)
+    r = i->uri_encoding;
  else
    return false;

@ -323,90 +311,52 @@ remote_to_utf8 (const char *str, const char **new)
  return ret;
 }

-char *get_remote_charset (void)
+struct iri *
+iri_new (void)
 {
-  return remote;
-}
-
-char *get_current_charset (void)
-{
-  return current;
-}
-
-void set_current_charset (char *charset)
-{
-  /*printf("[ current = `%s'\n", charset);*/
-  if (current)
-    {
-      /* Do nothing if already equal */
-      if (!strcasecmp (current, charset))
-        return;
-      xfree (current);
-    }
-
-  current = charset ? xstrdup (charset) : NULL;
-}
-
-void set_current_as_locale (void)
-{
-  /* sXXXav : assert opt.locale NULL ? */
-  /*printf("[ current = locale = `%s'\n", opt.locale);*/
-  if (current)
-    {
-      if (!strcasecmp (current, opt.locale))
-        return;
-      xfree (current);
-    }
-
-  current = xstrdup (opt.locale);
+  struct iri *i = xmalloc (sizeof (struct iri));
+  i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
+  i->content_encoding = NULL;
+  i->utf8_encode = opt.enable_iri;
 }

 void
-set_remote_charset (char *charset)
+iri_free (struct iri *i)
 {
-  /*printf("[ remote = `%s'\n", charset);*/
-  if (remote)
-    {
-      /* Do nothing if already equal */
-      if (!strcasecmp (remote, charset))
-        return;
-      xfree (remote);
-    }
-  remote = charset ? xstrdup (charset) : NULL;
+  xfree_null (i->uri_encoding);
+  xfree_null (i->content_encoding);
+  xfree (i);
 }

 void
-set_remote_as_current (void)
+set_uri_encoding (struct iri *i, char *charset)
 {
-  /*printf("[ remote = current = `%s'\n", current);*/
-  if (remote)
+  logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
+  if (opt.encoding_remote)
+    return;
+  if (i->uri_encoding)
    {
-      /* Do nothing if already equal */
-      if (current && !strcasecmp (remote, current))
+      if (!strcasecmp (i->uri_encoding, charset))
        return;
-      xfree (remote);
+      xfree (i->uri_encoding);
    }

-  remote = current ? xstrdup (current) : NULL;
+  i->uri_encoding = charset ? xstrdup (charset) : NULL;
 }

-void reset_utf8_encode (void)
+void
+set_content_encoding (struct iri *i, char *charset)
 {
-  set_utf8_encode (opt.enable_iri);
-}
-
-void set_utf8_encode (bool encode)
-{
-  utf8_encode = encode;
-}
-
-bool get_utf8_encode (void)
-{
-  return (!ugly_no_encode && utf8_encode);
-}
-
-void set_ugly_no_encode (bool ugly)
-{
-  ugly_no_encode = ugly;
+  logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
+  if (opt.encoding_remote)
+    return;
+  if (i->content_encoding)
+    {
+      if (!strcasecmp (i->content_encoding, charset))
+        return;
+      xfree (i->content_encoding);
+    }
+
+  i->content_encoding = charset ? xstrdup (charset) : NULL;
 }

--- a/src/iri.h
+++ b/src/iri.h
@ -30,49 +30,41 @@ as that of the covered work.  */
 #ifndef IRI_H
 #define IRI_H

+struct iri {
+  char *uri_encoding;     /* Encoding of the uri to fetch */
+  char *content_encoding;  /* Encoding of links inside the fetched file */
+  bool utf8_encode;       /* Will/Is the current url encoded in utf8 */
+};
+
 #ifdef ENABLE_IRI

 char *parse_charset (char *str);
 char *find_locale (void);
 bool check_encoding_name (char *encoding);
 const char *locale_to_utf8 (const char *str);
-char *idn_encode (char *host, bool utf8_encoded);
+char *idn_encode (struct iri *i, char *host);
 char *idn_decode (char *host);
-char *get_remote_charset (void);
-char *get_current_charset (void);
-void set_current_charset (char *charset);
-void set_current_as_locale (void);
-void set_current_charset (char *charset);
-void set_remote_charset (char *charset);
-void set_remote_as_current (void);
-bool remote_to_utf8 (const char *str, const char **new);
-void reset_utf8_encode (void);
-void set_utf8_encode (bool encode);
-bool get_utf8_encode (void);
-
-/* ugly ugly ugly */
-void set_ugly_no_encode (bool ugly);
+bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
+struct iri *iri_new (void);
+void iri_free (struct iri *i);
+void set_uri_encoding (struct iri *i, char *charset);
+void set_content_encoding (struct iri *i, char *charset);

 #else /* ENABLE_IRI */

+struct iri dummy_iri;
+
 #define parse_charset(str)          NULL
 #define find_locale()               NULL
 #define check_encoding_name(str)    false
 #define locale_to_utf8(str)         (str)
-#define idn_encode(str,encoded)     NULL
+#define idn_encode(a,b,c)           NULL
 #define idn_decode(str)             NULL
-#define get_remote_charset()        NULL
-#define get_current_charset()       NULL
-#define set_current_charset(str)
-#define set_current_as_locale()
-#define set_current_charset(str)
-#define set_remote_charset(str)
-#define set_remote_as_current()
-#define remote_to_utf8(a,b)         false
-#define reset_utf8_encode()
-#define set_utf8_encode(a)
-#define get_utf8_encode()           false
-#define set_ugly_no_encode(a)
+#define remote_to_utf8(a,b,c)       false
+#define iri_new()                   (&dummy_iri)
+#define iri_free(a)
+#define set_uri_encoding(a,b)
+#define set_content_encoding(a,b)

 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
--- a/src/main.c
+++ b/src/main.c
@ -57,7 +57,6 @@ as that of the covered work.  */
 #include "convert.h"
 #include "spider.h"
 #include "http.h"               /* for save_cookies */
-#include "iri.h"

 #include <getopt.h>
 #include <getpass.h>
@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\
      char *filename = NULL, *redirected_URL = NULL;
      int dt;

-      set_current_as_locale ();
-      set_ugly_no_encode (false);
-
      if ((opt.recursive || opt.page_requisites)
          && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
        {
@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\
        }
      else
        {
-          set_remote_as_current ();
-          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+          struct iri *i = iri_new ();
+          set_uri_encoding (i, opt.locale);
+          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
+                                 opt.recursive, i);
+          iri_free (i);
        }

      if (opt.delete_after && file_exists_p(filename))
--- a/src/recur.c
+++ b/src/recur.c
@ -61,7 +61,7 @@ struct queue_element {
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */
-  char *remote_encoding;
+  struct iri *iri;                /* sXXXav */
  bool css_allowed;             /* whether the document is allowed to
                                   be treated as CSS. */
  struct queue_element *next;   /* next element in queue */
@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue)
   into it.  */

 static void
-url_enqueue (struct url_queue *queue,
+url_enqueue (struct url_queue *queue, struct iri *i,
             const char *url, const char *referer, int depth,
             bool html_allowed, bool css_allowed)
 {
  struct queue_element *qel = xnew (struct queue_element);
-  char *charset = get_current_charset ();
+  qel->iri = i;
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue,
  qel->css_allowed = css_allowed;
  qel->next = NULL;

-  if (charset)
-    qel->remote_encoding = xstrdup (charset);
-  else
-    qel->remote_encoding = NULL;
-
  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;
@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue,
  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

-  /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+  if (i)
+    printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);

  if (queue->tail)
    queue->tail->next = qel;
@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue,
   succeeded, or false if the queue is empty.  */

 static bool
-url_dequeue (struct url_queue *queue,
+url_dequeue (struct url_queue *queue, struct iri **i,
             const char **url, const char **referer, int *depth,
             bool *html_allowed, bool *css_allowed)
 {
@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue,
  if (!queue->head)
    queue->tail = NULL;

-  set_remote_charset (qel->remote_encoding);
-  if (qel->remote_encoding)
-    xfree (qel->remote_encoding);
-
+  *i = qel->iri;
  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue,
 }

 static bool download_child_p (const struct urlpos *, struct url *, int,
-                              struct url *, struct hash_table *);
+                              struct url *, struct hash_table *, struct iri *);
 static bool descend_redirect_p (const char *, const char *, int,
-                                struct url *, struct hash_table *);
+                                struct url *, struct hash_table *, struct iri *);


 /* Retrieve a part of the web beginning with START_URL.  This used to
@ -207,10 +200,10 @@ retrieve_tree (const char *start_url)

  int up_error_code;
  struct url *start_url_parsed;
+  struct iri *i = iri_new ();
+  set_uri_encoding (i, opt.locale);

-  set_ugly_no_encode (true);
-  start_url_parsed= url_parse (start_url, &up_error_code);
-  set_ugly_no_encode (false);
+  start_url_parsed = url_parse (start_url, &up_error_code, i);
  if (!start_url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
@ -223,7 +216,8 @@ retrieve_tree (const char *start_url)

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+  url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+               false);
  string_set_add (blacklist, start_url_parsed->url);

  while (1)
@ -242,7 +236,7 @@ retrieve_tree (const char *start_url)

      /* Get the next URL from the queue... */

-      if (!url_dequeue (queue,
+      if (!url_dequeue (queue, (struct iri **) &i,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed, &css_allowed))
        break;
@ -283,7 +277,8 @@ retrieve_tree (const char *start_url)
          int dt = 0;
          char *redirected = NULL;

-          status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+          status = retrieve_url (url, &file, &redirected, referer, &dt,
+                                 false, i);

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
@ -311,7 +306,7 @@ retrieve_tree (const char *start_url)
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url, depth,
-                                           start_url_parsed, blacklist))
+                                           start_url_parsed, blacklist, i))
                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
@ -363,7 +358,7 @@ retrieve_tree (const char *start_url)
          bool meta_disallow_follow = false;
          struct urlpos *children
            = is_css ? get_urls_css_file (file, url) :
-                       get_urls_html (file, url, &meta_disallow_follow);
+                       get_urls_html (file, url, &meta_disallow_follow, i);

          if (opt.use_robots && meta_disallow_follow)
            {
@ -374,9 +369,8 @@ retrieve_tree (const char *start_url)
          if (children)
            {
              struct urlpos *child = children;
-              set_ugly_no_encode (true);
-              struct url *url_parsed = url_parse (url, NULL);
-              set_ugly_no_encode (false);
+              struct url *url_parsed = url_parse (url, NULL, i);
+              struct iri *ci;
              char *referer_url = url;
              bool strip_auth = (url_parsed != NULL
                                 && url_parsed->user != NULL);
@ -393,9 +387,11 @@ retrieve_tree (const char *start_url)
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
-                                        blacklist))
+                                        blacklist, i))
                    {
-                      url_enqueue (queue, xstrdup (child->url->url),
+                      ci = iri_new ();
+                      set_uri_encoding (ci, i->content_encoding);
+                      url_enqueue (queue, ci, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html,
                                   child->link_expect_css);
@ -440,6 +436,7 @@ retrieve_tree (const char *start_url)
      xfree (url);
      xfree_null (referer);
      xfree_null (file);
+      iri_free (i);
    }

  /* If anything is left of the queue due to a premature exit, free it
@ -448,9 +445,11 @@ retrieve_tree (const char *start_url)
    char *d1, *d2;
    int d3;
    bool d4, d5;
-    while (url_dequeue (queue,
+    struct iri *d6;
+    while (url_dequeue (queue, (struct iri **)&d6,
                        (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
      {
+        iri_free (d6);
        xfree (d1);
        xfree_null (d2);
      }
@ -479,7 +478,8 @@ retrieve_tree (const char *start_url)

 static bool
 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
-                  struct url *start_url_parsed, struct hash_table *blacklist)
+                  struct url *start_url_parsed, struct hash_table *blacklist,
+                  struct iri *iri)
 {
  struct url *u = upos->url;
  const char *url = u->url;
@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
      if (!specs)
        {
          char *rfile;
-          if (res_retrieve_file (url, &rfile))
+          if (res_retrieve_file (url, &rfile, iri))
            {
              specs = res_parse_from_file (rfile);

@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,

 static bool
 descend_redirect_p (const char *redirected, const char *original, int depth,
-                    struct url *start_url_parsed, struct hash_table *blacklist)
+                    struct url *start_url_parsed, struct hash_table *blacklist,
+                    struct iri *iri)
 {
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  bool success;

-  set_ugly_no_encode (true);
-  orig_parsed = url_parse (original, NULL);
+  orig_parsed = url_parse (original, NULL, NULL);
  assert (orig_parsed != NULL);

-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL);
  assert (new_parsed != NULL);
-  set_ugly_no_encode (false);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist);
+                              start_url_parsed, blacklist, iri);

  url_free (orig_parsed);
  url_free (new_parsed);
--- a/src/res.c
+++ b/src/res.c
@ -532,21 +532,28 @@ res_get_specs (const char *host, int port)
   Return true if robots were retrieved OK, false otherwise.  */

 bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
 {
+  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider;

+  /* Copy server URI encoding for a possible IDNA transformation, no need to
+     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+  set_uri_encoding (i, iri->uri_encoding);
+  i->utf8_encode = false;
+
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider       = false;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
  opt.timestamping = saved_ts_val;
-  opt.spider       = saved_sp_val;  
+  opt.spider       = saved_sp_val;
  xfree (robots_url);
+  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
--- a/src/res.h
+++ b/src/res.h
@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *);
 void res_register_specs (const char *, int, struct robot_specs *);
 struct robot_specs *res_get_specs (const char *, int);

-bool res_retrieve_file (const char *, char **);
+bool res_retrieve_file (const char *, char **, struct iri *);

 bool is_robots_txt_url (const char *);

--- a/src/retr.c
+++ b/src/retr.c
@ -598,7 +598,7 @@ static char *getproxy (struct url *);

 uerr_t
 retrieve_url (const char *origurl, char **file, char **newloc,
-              const char *refurl, int *dt, bool recursive)
+              const char *refurl, int *dt, bool recursive, struct iri *iri)
 {
  uerr_t result;
  char *url;
@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
  if (file)
    *file = NULL;

-  reset_utf8_encode ();
-
 second_try:
-  u = url_parse (url, &up_error_code);
+  u = url_parse (url, &up_error_code, iri);
  if (!u)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
      return URLERROR;
    }

-  /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+  printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);

  if (!refurl)
    refurl = opt.referer;
@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
  proxy = getproxy (u);
  if (proxy)
    {
-      /* sXXXav : support IRI for proxy */
+      /* sXXXav : could a proxy include a path ??? */
+      struct iri *pi = iri_new ();
+      set_uri_encoding (pi, opt.locale);
+      pi->utf8_encode = false;
+
      /* Parse the proxy URL.  */
-      set_ugly_no_encode (true);
-      proxy_url = url_parse (proxy, &up_error_code);
-      set_ugly_no_encode (false);
+      proxy_url = url_parse (proxy, &up_error_code, NULL);
      if (!proxy_url)
        {
          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc,
 #endif
      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
    {
-      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
    }
  else if (u->scheme == SCHEME_FTP)
    {
@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc,
      xfree (mynewloc);
      mynewloc = construced_newloc;

-      reset_utf8_encode ();
+      /* Reset UTF-8 encoding state, keep the URI encoding and reset
+         the content encoding. */
+      iri->utf8_encode = opt.enable_iri;
+      set_content_encoding (iri, NULL);

      /* Now, see if this new location makes sense. */
-      newloc_parsed = url_parse (mynewloc, &up_error_code);
+      newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
      if (!newloc_parsed)
        {
          logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc,
    }

  /* Try to not encode in UTF-8 if fetching failed */
-  if (!(*dt & RETROKF) && get_utf8_encode ())
+  if (!(*dt & RETROKF) && iri->utf8_encode)
    {
-      set_utf8_encode (false);
-      /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+      iri->utf8_encode = false;
+      printf ("[Fallbacking to non-utf8 for `%s'\n", url);
      goto second_try;
    }

@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count)
 {
  uerr_t status;
  struct urlpos *url_list, *cur_url;
+  struct iri *iri = iri_new();

  char *input_file = NULL;
  const char *url = file;

  status = RETROK;             /* Suppose everything is OK.  */
  *count = 0;                  /* Reset the URL count.  */
-  
+
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_content_encoding (iri, opt.locale);
+
  if (url_has_scheme (url))
    {
      uerr_t status;
-      status = retrieve_url (url, &input_file, NULL, NULL, NULL, false);
+      status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri);
      if (status != RETROK)
        return status;
    }
  else
    input_file = (char *) file;

-  url_list = (html ? get_urls_html (input_file, NULL, NULL)
+  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
              : get_urls_file (input_file));

  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count)
          opt.follow_ftp = old_follow_ftp;
        }
      else
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
+	                       &dt, opt.recursive, iri);

      if (filename && opt.delete_after && file_exists_p (filename))
        {
@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url)
 {
  bool ret;
  struct url *u;
-  set_ugly_no_encode(true);
-  u= url_parse (url, NULL);
-  set_ugly_no_encode(false);
+  struct iri *i = iri_new();
+  /* url was given in the command line, so use locale as encoding */
+  set_uri_encoding (i, opt.locale);
+  u= url_parse (url, NULL, i);
  if (!u)
    return false;
  ret = getproxy (u) != NULL;
--- a/src/retr.h
+++ b/src/retr.h
@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);

-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
+                     bool, struct iri *);
 uerr_t retrieve_from_file (const char *, bool, int *);

 const char *retr_rate (wgint, double);
--- a/src/url.c
+++ b/src/url.c
@ -641,7 +641,7 @@ static const char *parse_errors[] = {
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code. */
 struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri)
 {
  struct url *u;
  const char *p;
@ -660,7 +660,7 @@ url_parse (const char *url, int *error)
  int port;
  char *user = NULL, *passwd = NULL;

-  char *url_encoded = NULL;
+  char *url_encoded = NULL, *new_url = NULL;

  int error_code;

@ -671,20 +671,20 @@ url_parse (const char *url, int *error)
      goto error;
    }

-  if (opt.enable_iri && get_utf8_encode ())
+  if (iri && iri->utf8_encode)
    {
-      const char *new;
-      bool utf8_encode;
      url_unescape ((char *) url);
-      utf8_encode = remote_to_utf8 (url, &new);
-      set_utf8_encode (utf8_encode);
-      if (utf8_encode)
-        url = new;
+      iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+      if (!iri->utf8_encode)
+        new_url = NULL;
    }

-  url_encoded = reencode_escapes (url);
+  url_encoded = reencode_escapes (new_url ? new_url : url);
  p = url_encoded;

+  if (new_url && url_encoded != new_url)
+    xfree (new_url);
+
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
@ -854,16 +854,17 @@ url_parse (const char *url, int *error)
    {
      url_unescape (u->host);
      host_modified = true;
-    }

-  if (opt.enable_iri)
-    {
-      char *new = idn_encode (u->host, get_utf8_encode ());
-      if (new)
+      /* Apply IDNA regardless of iri->utf8_encode status */
+      if (opt.enable_iri && iri)
        {
-          xfree (u->host);
-          u->host = new;
-          host_modified = true;
+          char *new = idn_encode (iri, u->host);
+          if (new)
+            {
+              xfree (u->host);
+              u->host = new;
+              host_modified = true;
+            }
        }
    }

--- a/src/url.h
+++ b/src/url.h
@ -84,7 +84,7 @@ struct url

 char *url_escape (const char *);

-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, struct iri *iri);
 const char *url_error (int);
 char *url_full_path (const struct url *);
 void url_set_dir (struct url *, const char *);
--- a/src/wget.h
+++ b/src/wget.h
@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT;
 #include "quote.h"
 #include "quotearg.h"

+/* Likewise for struct iri definition */
+#include "iri.h"
+
 /* Useful macros used across the code: */

 /* The number of elements in an array.  For example: