From 3f84a5e00e255ab46e69d8ff5e565b238fce4126 Mon Sep 17 00:00:00 2001
From: hniksic <devnull@localhost>
Date: Wed, 26 Nov 2003 08:37:04 -0800
Subject: [PATCH] [svn] Squash embedded newlines in links.

---
 src/ChangeLog    |  5 +++++
 src/html-parse.c | 19 ++++++++++---------
 src/html-url.c   |  9 ++++++---
 3 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/src/ChangeLog b/src/ChangeLog
index 24ae86fa..c6a4f660 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
+2003-11-26  Hrvoje Niksic  <hniksic@xemacs.org>
+
+	* html-parse.c (convert_and_copy): Remove embedded newlines when
+	AP_TRIM_BLANKS is specified.
+
 2003-11-26  Hrvoje Niksic  <hniksic@xemacs.org>
 
 	* ftp.c: Set con->csock to -1 where rbuf_uninitialize was
diff --git a/src/html-parse.c b/src/html-parse.c
index 2a09ff09..4a86627e 100644
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -360,17 +360,16 @@ enum {
      the ASCII range when copying the string.
 
    * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
-     of text.  */
+     of text, as well as embedded newlines.  */
 
 static void
 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
 {
   int old_tail = pool->tail;
-  int size;
 
-  /* First, skip blanks if required.  We must do this before entities
-     are processed, so that blanks can still be inserted as, for
-     instance, `&#32;'.  */
+  /* Skip blanks if required.  We must do this before entities are
+     processed, so that blanks can still be inserted as, for instance,
+     `&#32;'.  */
   if (flags & AP_TRIM_BLANKS)
     {
       while (beg < end && ISSPACE (*beg))
@@ -378,7 +377,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
       while (end > beg && ISSPACE (end[-1]))
 	--end;
     }
-  size = end - beg;
 
   if (flags & AP_DECODE_ENTITIES)
     {
@@ -391,15 +389,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
 	 never lengthen it.  */
       const char *from = beg;
       char *to;
+      int squash_newlines = flags & AP_TRIM_BLANKS;
 
       POOL_GROW (pool, end - beg);
       to = pool->contents + pool->tail;
 
       while (from < end)
 	{
-	  if (*from != '&')
-	    *to++ = *from++;
-	  else
+	  if (*from == '&')
 	    {
 	      int entity = decode_entity (&from, end);
 	      if (entity != -1)
@@ -407,6 +404,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
 	      else
 		*to++ = *from++;
 	    }
+	  else if ((*from == '\n' || *from == '\r') && squash_newlines)
+	    ++from;
+	  else
+	    *to++ = *from++;
 	}
       /* Verify that we haven't exceeded the original size.  (It
 	 shouldn't happen, hence the assert.)  */
diff --git a/src/html-url.c b/src/html-url.c
index 89b93539..59d873b3 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -612,9 +612,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
     init_interesting ();
 
   /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
-     generate <a href=" foo"> instead of <a href="foo"> (Netscape
-     ignores spaces as well.)  If you really mean space, use &32; or
-     %20.  */
+     generate <a href=" foo"> instead of <a href="foo">.  (Browsers
+     ignore spaces as well.)  If you really mean space, use &#32; or
+     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
+     e.g. in <img src="foo.[newline]html">.  Such newlines are also
+     ignored by IE and Mozilla and are presumably introduced by
+     writing HTML with editors that force word wrap.  */
   flags = MHT_TRIM_VALUES;
   if (opt.strict_comments)
     flags |= MHT_STRICT_COMMENTS;