[svn] Committed my patch from <sxsy9z4xz5m.fsf@florida.arsdigita.de>

(recognize HTML entities.)
2024-07-03 16:38:41 -04:00 · 2000-10-31 17:25:12 -08:00 · 2000-10-31 17:25:12 -08:00 · 515d82fb95
commit 515d82fb95
parent 846b045a69
4 changed files with 121 additions and 17 deletions
--- a/src/ChangeLog
+++ b/src/ChangeLog
@ -1,3 +1,12 @@
 2000-11-01  Hrvoje Niksic  <hniksic@arsdigita.com>
 	* url.c (get_urls_html): Decode HTML entities using
 	html_decode_entities.
 	* html.c (htmlfindurl): Don't count the `#' in numeric entities
 	(&#NNN;) as an HTML fragment.
 	(html_decode_entities): New function.
 2000-11-01  Hrvoje Niksic  <hniksic@arsdigita.com>
 	* html.c (htmlfindurl): Fix recognition of # HTML fragments.
--- a/src/html.c
+++ b/src/html.c
@ -91,7 +91,6 @@ idmatch (struct tag_attr *tags, const char *tag, const char *attr)
  return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
 }
 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
@ -270,7 +269,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
 	      for (++buf, --bufsize;
 		   bufsize && *buf != s->quote_char && *buf != '\n';
 		   ++buf, --bufsize)
-		if (!ph && *buf == '#')
+		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		{
@ -294,7 +293,7 @@ htmlfindurl (const char *buf, int bufsize, int *size, int init,
 	      p = buf;
 	      for (; bufsize && !ISSPACE (*buf) && *buf != '>';
 		   ++buf, --bufsize)
-		if (!ph && *buf == '#')
+		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		break;
@ -437,6 +436,83 @@ html_base (void)
  return global_state.base;
 }
 /* Return non-zero if the REMAIN characters starting at P begin with
    the entity name NAME (NAMELEN characters long) and the name is
    properly terminated: by the end of the buffer, by `;', or by any
    other non-alphanumeric character.  The end-of-buffer test is made
    before P[NAMELEN] is examined, so nothing past P + REMAIN is ever
    read.  */
 static int
 entity_name_matches (const char *p, int remain, const char *name, int namelen)
 {
   if (remain < namelen || memcmp (p, name, namelen) != 0)
     return 0;
   if (remain == namelen)
     return 1;
   return p[namelen] == ';' || !ISALNUM (p[namelen]);
 }
 /* Create a malloc'ed copy of text in the range [beg, end), but with
    the HTML entities processed.  Recognized entities are &lt, &gt,
    &amp, &quot, &nbsp and the numerical entities (&#NNN).  A trailing
    `;' after a recognized entity is consumed; it is not required.
    Anything that starts with `&' but is not a recognized entity is
    copied through unchanged.
    Numeric entities are reduced modulo 256 to a single byte.  Note
    that a value of 0 (e.g. &#0; or &#256;) therefore stores a NUL
    byte in the middle of the result.
    The result is always NUL-terminated and is never longer than the
    input.  The caller is responsible for freeing it.
    We don't implement the "Added Latin 1" entities proposed by
    rfc1866 (except for nbsp), because it is unnecessary in the
    context of Wget, and would require hashing to work efficiently.  */
 char *
 html_decode_entities (const char *beg, const char *end)
 {
   static const struct
   {
     const char *name;
     int len;
     unsigned char value;
   } named[] = {
     { "lt", 2, '<' },
     { "gt", 2, '>' },
     { "amp", 3, '&' },
     { "quot", 4, '\"' },
     { "nbsp", 4, 160 }
   };
   /* Decoding never lengthens the text, so the input length plus one
      byte for the terminator is the worst case.  The buffer is not
      shrunk afterwards.  */
   char *newstr = (char *)xmalloc (end - beg + 1);
   const char *from = beg;
   char *to = newstr;
   while (from < end)
     {
       const char *p;
       int decoded = 0;
       if (*from != '&')
         {
           *to++ = *from++;
           continue;
         }
       /* P scans ahead of FROM; FROM is only advanced past the entity
          once we know it really is one.  */
       p = from + 1;
       if (p < end && *p == '#')
         {
           ++p;
           if (p < end && ISDIGIT (*p))
             {
               unsigned int numeric = 0;
               /* Reduce modulo 256 at every step.  The final value is
                  the same as masking once at the end, but the
                  accumulator cannot overflow on long digit runs.  */
               for (; p < end && ISDIGIT (*p); p++)
                 numeric = (10 * numeric + (unsigned int) (*p - '0')) & 0xff;
               /* Digits immediately followed by a letter are not a
                  numeric entity.  */
               if (!(p < end && ISALPHA (*p)))
                 {
                   *to++ = (char) numeric;
                   decoded = 1;
                 }
             }
         }
       else if (p < end)
         {
           int remain = end - p;
           size_t i;
           for (i = 0; i < sizeof (named) / sizeof (named[0]); i++)
             if (entity_name_matches (p, remain, named[i].name, named[i].len))
               {
                 *to++ = (char) named[i].value;
                 p += named[i].len;
                 decoded = 1;
                 break;
               }
         }
       if (!decoded)
         {
           /* This was not an entity after all.  Copy the `&' literally
              and resume scanning right after it.  */
           *to++ = *from++;
           continue;
         }
       /* If the entity was followed by `;', we step over the `;'.
          Otherwise, it was followed by either a non-alphanumeric
          or EOB, in which case we do nothing.  */
       if (p < end && *p == ';')
         ++p;
       from = p;
     }
   *to = '\0';
   return newstr;
 }
 /* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:
--- a/src/html.h
+++ b/src/html.h
@ -34,6 +34,7 @@ struct fileinfo;
 /* Function declarations */
 const char *htmlfindurl PARAMS ((const char *, int, int *, int, int));
 const char *html_base PARAMS ((void));
 char *html_decode_entities PARAMS ((const char *, const char *));
 uerr_t ftp_index PARAMS ((const char *, struct urlinfo *, struct fileinfo *));
 #endif /* HTML_H */
--- a/src/url.c
+++ b/src/url.c
@ -869,6 +869,7 @@ get_urls_html (const char *file, const char *this_url, int silent,
      const char *pbuf = buf;
      char *constr, *base;
      const char *cbase;
      char *needs_freeing, *url_data;
      first_time = 0;
@ -889,16 +890,27 @@ get_urls_html (const char *file, const char *this_url, int silent,
      if (!size)
 	break;
      /* It would be nice if we could avoid allocating memory in this
         loop, but I don't see an easy way.  To process the entities,
         we need to either copy the data, or change it destructively.
         I choose the former.
 	 We have two pointers: needs_freeing and url_data, because the
 	 code below does things like url_data += <something>, and we
 	 want to pass the original string to free(). */
      needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
      size = strlen (url_data);
      for (i = 0; protostrings[i]; i++)
 	{
-	  if (!strncasecmp (protostrings[i], pbuf,
+	  if (!strncasecmp (protostrings[i], url_data,
 			    MINVAL (strlen (protostrings[i]), size)))
 	    break;
 	}
      /* Check for http:RELATIVE_URI.  See below for details.  */
      if (protostrings[i]
-	  && !(strncasecmp (pbuf, "http:", 5) == 0
+	  && !(strncasecmp (url_data, "http:", 5) == 0
-	       && strncasecmp (pbuf, "http://", 7) != 0))
+	       && strncasecmp (url_data, "http://", 7) != 0))
 	{
 	  no_proto = 0;
 	}
@ -909,20 +921,23 @@ get_urls_html (const char *file, const char *this_url, int silent,
 	     relative URI-s as <a href="http:URL">.  Just strip off the
 	     silly leading "http:" (as well as any leading blanks
 	     before it).  */
-	  if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
+	  if ((size > 5) && !strncasecmp ("http:", url_data, 5))
-	    pbuf += 5, size -= 5;
+	    url_data += 5, size -= 5;
 	}
      if (!no_proto)
 	{
 	  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 	    {
-	      if (!strncasecmp (sup_protos[i].name, pbuf,
+	      if (!strncasecmp (sup_protos[i].name, url_data,
 			       MINVAL (strlen (sup_protos[i].name), size)))
 		break;
 	    }
 	  /* Do *not* accept a non-supported protocol.  */
 	  if (i == ARRAY_SIZE (sup_protos))
-	    continue;
+	    {
 	      free (needs_freeing);
 	      continue;
 	    }
 	}
      if (no_proto)
 	{
@ -945,13 +960,14 @@ get_urls_html (const char *file, const char *this_url, int silent,
 		  /* Use malloc, not alloca because this is called in
                     a loop. */
 		  char *temp = (char *)malloc (size + 1);
-		  strncpy (temp, pbuf, size);
+		  strncpy (temp, url_data, size);
 		  temp[size] = '\0';
 		  logprintf (LOG_NOTQUIET,
 			     _("Error (%s): Link %s without a base provided.\n"),
 			     file, temp);
 		  free (temp);
 		}
 	      free (needs_freeing);
 	      continue;
 	    }
 	  if (this_url)
@ -966,17 +982,18 @@ get_urls_html (const char *file, const char *this_url, int silent,
 		  logprintf (LOG_NOTQUIET, _("\
 Error (%s): Base %s relative, without referer URL.\n"),
 			     file, cbase);
 		  free (needs_freeing);
 		  continue;
 		}
 	      base = xstrdup (cbase);
 	    }
-	  constr = construct (base, pbuf, size, no_proto);
+	  constr = construct (base, url_data, size, no_proto);
 	  free (base);
 	}
      else /* has proto */
 	{
 	  constr = (char *)xmalloc (size + 1);
-	  strncpy (constr, pbuf, size);
+	  strncpy (constr, url_data, size);
 	  constr[size] = '\0';
 	}
 #ifdef DEBUG
@ -988,7 +1005,7 @@ Error (%s): Base %s relative, without referer URL.\n"),
 	  tmp2 = html_base ();
 	  /* Use malloc, not alloca because this is called in a loop. */
 	  tmp = (char *)xmalloc (size + 1);
-	  strncpy (tmp, pbuf, size);
+	  strncpy (tmp, url_data, size);
 	  tmp[size] = '\0';
 	  logprintf (LOG_ALWAYS,
 		     "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
@ -1009,14 +1026,15 @@ Error (%s): Base %s relative, without referer URL.\n"),
      memset (current, 0, sizeof (*current));
      current->next = NULL;
      current->url = constr;
-      current->size = size;
+      current->size = step;
-      current->pos = pbuf - orig_buf;
+      current->pos = buf - orig_buf;
      /* A URL is relative if the host and protocol are not named,
 	 and the name does not start with `/'.  */
-      if (no_proto && *pbuf != '/')
+      if (no_proto && *url_data != '/')
 	current->flags |= (URELATIVE | UNOPROTO);
      else if (no_proto)
 	current->flags |= UNOPROTO;
      free (needs_freeing);
    }
  free (orig_buf);