[svn] Removed.

2024-07-03 16:38:41 -04:00 · 2000-12-09 18:25:19 -08:00 · 2000-12-09 18:25:19 -08:00 · 3665d73453
commit 3665d73453
parent 2878b0644b
1 changed files with 0 additions and 684 deletions
--- a/src/html.c
+++ b/src/html.c
@ -1,684 +0,0 @@
-/* A simple HTML parser.
-   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
-
-This file is part of Wget.
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
-
-#include <config.h>
-
-#include <ctype.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <errno.h>
-
-#include "wget.h"
-#include "url.h"
-#include "utils.h"
-#include "ftp.h"
-#include "html.h"
-
-#ifndef errno
-extern int errno;
-#endif
-
-static state_t global_state;
-
-struct tag_attr {
-  char *tag;
-  char *attr;
-};
-
-
-/* Match a string against a null-terminated list of identifiers.  */
-static int
-idmatch (struct tag_attr *tags, const char *tag, const char *attr)
-{
-  int  i, j;
-  
-  if (tag == NULL || attr == NULL)
-    return FALSE;
-  
-  for (i = 0; tags[i].tag; i++)
-    /* Loop through all the tags wget ever cares about. */
-    if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
-      /* The tag and attribute matched one of the ones wget cares about. */
-      {
-	if (opt.ignore_tags)
-	  /* --ignore-tags was specified.  Do not match these specific tags.
-	     --ignore-tags takes precedence over --follow-tags, so we process
-	     --ignore first and fall through if there's no match. */
-	  for (j = 0; opt.ignore_tags[j] != NULL; j++)
-	    /* Loop through all the tags this user doesn't care about. */
-	    if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
-	      return FALSE;
-	
-	if (opt.follow_tags)
-	  /* --follow-tags was specified.  Only match these specific tags, so
-	     return FALSE if we don't match one of them. */
-	  {
-	    for (j = 0; opt.follow_tags[j] != NULL; j++)
-	      /* Loop through all the tags this user cares about. */
-	      if (strcasecmp(opt.follow_tags[j], tag) == EQ)
-		return TRUE;
-	    
-	    return FALSE;  /* wasn't one of the explicitly desired tags */
-	  }
-	
-	/* If we get to here, --follow-tags isn't being used, and --ignore-tags,
-	   if specified, didn't include this tag, so it's okay to follow. */
-	return TRUE;
-      }
-
-  return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
-}
-
-/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
-   describing URLs to follow.  When a tag is encountered, extract its
-   components (as described by html_allow[] array), and return the
-   address and the length of the string.  Return NULL if no URL is
-   found.  */
-const char *
-htmlfindurl (const char *buf, int bufsize, int *size, int init,
-	     int dash_p_leaf_HTML)
-{
-  const char *p, *ph;
-  state_t    *s = &global_state;
-
-  /* NULL-terminated list of tags and modifiers someone would want to
-     follow -- feel free to edit to suit your needs: */
-  static struct tag_attr html_allow[] = {
-    { "script", "src" },
-    { "img", "src" },
-    { "img", "href" },
-    { "body", "background" },
-    { "frame", "src" },
-    { "iframe", "src" },
-    { "fig", "src" },
-    { "overlay", "src" },
-    { "applet", "code" },
-    { "script", "src" },
-    { "embed", "src" },
-    { "bgsound", "src" },
-    { "img", "lowsrc" },
-    { "input", "src" },
-    { "layer", "src" },
-    { "table", "background"},
-    { "th", "background"},
-    { "td", "background"},
-    /* Tags below this line are treated specially.  */
-    { "a", "href" },
-    { "area", "href" },
-    { "base", "href" },
-    { "link", "href" },
-    { "link", "rel" },
-    { "meta", "content" },
-    { NULL, NULL }
-  };
-
-  if (init)
-    {
-      DEBUGP (("Resetting a parser state.\n"));
-      memset (s, 0, sizeof (*s));
-    }
-
-  while (1)
-    {
-      const char*  link_href = NULL;
-      const char*  link_rel = NULL;
-      int          link_href_saved_size = 0; /* init. just to shut up warning */
-
-      if (!bufsize)
-	break;
-      /* Let's look for a tag, if we are not already in one.  */
-      if (!s->at_value)
-	{
-	  /* Find '<'.  */
-	  if (*buf != '<')
-	    for (; bufsize && *buf != '<'; ++buf, --bufsize);
-	  if (!bufsize)
-	    break;
-	  /* Skip spaces.  */
-	  for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
-	       ++buf, --bufsize);
-	  if (!bufsize)
-	    break;
-	  p = buf;
-	  /* Find the tag end.  */
-	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
-	       ++buf, --bufsize);
-	  if (!bufsize)
-	    break;
-	  if (*buf == '=')
-	    {
-	      /* <tag=something> is illegal.  Just skip it.  */
-	      ++buf, --bufsize;
-	      continue;
-	    }
-	  if (p == buf)
-	    {
-	      /* *buf == '>'.  */
-	      ++buf, --bufsize;
-	      continue;
-	    }
-	  s->tag = strdupdelim (p, buf);
-	  if (*buf == '>')
-	    {
-	      xfree (s->tag);
-	      s->tag = NULL;
-	      ++buf, --bufsize;
-	      continue;
-	    }
-	}
-      else                      /* s->at_value */
-	{
-	  /* Reset AT_VALUE.  */
-	  s->at_value = 0;
-	  /* If in quotes, just skip out of them and continue living.  */
-	  if (s->in_quote)
-	    {
-	      s->in_quote = 0;
-	      for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
-	      if (!bufsize)
-		break;
-	      ++buf, --bufsize;
-	    }
-	  if (!bufsize)
-	    break;
-	  if (*buf == '>')
-	    {
-	      FREE_MAYBE (s->tag);
-	      FREE_MAYBE (s->attr);
-	      s->tag = s->attr = NULL;
-	      continue;
-	    }
-	}
-      /* Find the attributes.  */
-      do
-	{
-	  FREE_MAYBE (s->attr);
-	  s->attr = NULL;
-	  if (!bufsize)
-	    break;
-	  /* Skip the spaces if we have them.  We don't have them at
-	     places like <img alt="something"src="something-else">.
-	                                     ^ no spaces here */
-	  if (ISSPACE (*buf))
-	    for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
-		 ++buf, --bufsize);
-	  if (!bufsize || *buf == '>')
-	    break;
-	  if (*buf == '=')
-	    {
-	      /* This is the case of <tag = something>, which is
-		 illegal.  Just skip it.  */
-	      ++buf, --bufsize;
-	      continue;
-	    }
-	  p = buf;
-	  /* Find the attribute end.  */
-	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
-	       ++buf, --bufsize);
-	  if (!bufsize || *buf == '>')
-	    break;
-	  /* Construct the attribute.  */
-	  s->attr = strdupdelim (p, buf);
-	  /* Now we must skip the spaces to find '='.  */
-	  if (*buf != '=')
-	    {
-	      for (; bufsize && ISSPACE (*buf) && *buf != '>';
-		   ++buf, --bufsize);
-	      if (!bufsize || *buf == '>')
-		break;
-	    }
-	  /* If we still don't have '=', something is amiss.  */
-	  if (*buf != '=')
-	    continue;
-	  /* Find the beginning of attribute value by skipping the
-	     spaces.  */
-	  ++buf, --bufsize;
-	  for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
-	  if (!bufsize || *buf == '>')
-	    break;
-	  ph = NULL;
-	  /* The value of an attribute can, but does not have to be
-	     quoted.  */
-	  if (*buf == '\"' || *buf == '\'')
-	    {
-	      s->in_quote = 1;
-	      s->quote_char = *buf;
-	      p = buf + 1;
-	      for (++buf, --bufsize;
-		   bufsize && *buf != s->quote_char && *buf != '\n';
-		   ++buf, --bufsize)
-		if (!ph && *buf == '#' && *(buf - 1) != '&')
-		  ph = buf;
-	      if (!bufsize)
-		{
-		  s->in_quote = 0;
-		  break;
-		}
-	      if (*buf == '\n')
-		{
-		  /* #### Is the following logic good?
-
-		     Obviously no longer in quote.  It might be well
-		     to check whether '>' was encountered, but that
-		     would be encouraging writers of invalid HTMLs,
-		     and we don't want that, now do we?  */
-		  s->in_quote = 0;
-		  continue;
-		}
-	    }
-	  else
-	    {
-	      p = buf;
-	      for (; bufsize && !ISSPACE (*buf) && *buf != '>';
-		   ++buf, --bufsize)
-		if (!ph && *buf == '#' && *(buf - 1) != '&')
-		  ph = buf;
-	      if (!bufsize)
-		break;
-	    }
-	  /* If '#' was found unprotected in a URI, it is probably an
-	     HTML marker, or color spec.  */
-	  *size = (ph ? ph : buf) - p;
-	  /* The URI is liable to be returned if:
-	     1) *size != 0;
-	     2) its tag and attribute are found in html_allow.  */
-	  if (*size && idmatch (html_allow, s->tag, s->attr))
-	    {
-	      if (strcasecmp(s->tag, "a") == EQ ||
-		  strcasecmp(s->tag, "area") == EQ)
-		{
-		  /* Only follow these if we're not at a -p leaf node, as they
-		     always link to external documents. */
-		  if (!dash_p_leaf_HTML)
-		    {
-		      s->at_value = 1;
-		      return p;
-		    }
-		}
-	      else if (!strcasecmp (s->tag, "base") &&
-		       !strcasecmp (s->attr, "href"))
-		{
-		  FREE_MAYBE (s->base);
-		  s->base = strdupdelim (p, buf);
-		}
-	      else if (strcasecmp(s->tag, "link") == EQ)
-		{
-		  if (strcasecmp(s->attr, "href") == EQ)
-		    {
-		      link_href = p;
-		      link_href_saved_size = *size;  /* for restoration below */
-		    }
-		  else if (strcasecmp(s->attr, "rel") == EQ)
-		    link_rel = p;
-
-		  if (link_href != NULL && link_rel != NULL)
-		    /* Okay, we've now seen this <LINK> tag's HREF and REL
-		       attributes (they may be in either order), so it's now
-		       possible to decide if we want to traverse it. */
-		    if (!dash_p_leaf_HTML ||
-			strncasecmp(link_rel, "stylesheet",
-				    sizeof("stylesheet") - 1) == EQ)
-		      /* In the normal case, all <LINK> tags are fair game.
-			 
-			 In the special case of when -p is active, however, and
-			 we're at a leaf node (relative to the -l max. depth) in
-			 the HTML document tree, the only <LINK> tag we'll
-			 follow is a <LINK REL="stylesheet">, as it's necessary
-			 for displaying this document properly.  We won't follow
-			 other <LINK> tags, like <LINK REL="home">, for
-			 instance, as they refer to external documents.
-			 
-			 Note that the above strncasecmp() will incorrectly
-			 consider something like '<LINK REL="stylesheet.old"' as
-			 equivalent to '<LINK REL="stylesheet"'.  Not really
-			 worth the trouble to explicitly check for such cases --
-			 if time is spent, it should be spent ripping out wget's
-			 somewhat kludgy HTML parser and hooking in a real,
-			 componentized one. */
-		      {
-			/* When we return, the 'size' IN/OUT parameter
-			   determines where in the buffer the end of the current
-			   attribute value is.  If REL came after HREF in this
-			   <LINK> tag, size is currently set to the size for
-			   REL's value -- set it to what it was when we were
-			   looking at HREF's value. */
-			*size = link_href_saved_size;
-			
-			s->at_value = 1;
-			return link_href;
-		      }
-		}
-	      else if (!strcasecmp (s->tag, "meta") &&
-		       !strcasecmp (s->attr, "content"))
-		{
-		  /* Some pages use a META tag to specify that the page
-		     be refreshed by a new page after a given number of
-		     seconds.  We need to attempt to extract an URL for
-		     the new page from the other garbage present.  The
-		     general format for this is:                  
-		     <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
-
-		     So we just need to skip past the "0; URL="
-		     garbage to get to the URL.  META tags are also
-		     used for specifying random things like the page
-		     author's name and what editor was used to create
-		     it.  So we need to be careful to ignore them and
-		     not assume that an URL will be present at all.  */
-		  for (; *size && ISDIGIT (*p); p++, *size -= 1);
-		  if (*p == ';')
-		    {
-		      for (p++, *size -= 1;
-			   *size && ISSPACE (*p);
-			   p++, *size -= 1) ;
-		      if (!strncasecmp (p, "URL=", 4))
-			{
-			  p += 4, *size -= 4;
-			  s->at_value = 1;
-			  return p;
-			}
-		    }
-		}
-	      else
-		{
-		  s->at_value = 1;
-		  return p;
-		}
-	    }
-	  /* Exit from quote.  */
-	  if (*buf == s->quote_char)
-	    {
-	      s->in_quote = 0;
-	      ++buf, --bufsize;
-	    }
-	} while (*buf != '>');
-      FREE_MAYBE (s->tag);
-      FREE_MAYBE (s->attr);
-      s->tag = s->attr = NULL;
-      if (!bufsize)
-	break;
-    }
-
-  FREE_MAYBE (s->tag);
-  FREE_MAYBE (s->attr);
-  FREE_MAYBE (s->base);
-  memset (s, 0, sizeof (*s));	/* just to be sure */
-  DEBUGP (("HTML parser ends here (state destroyed).\n"));
-  return NULL;
-}
-
-/* The function returns the base reference of HTML buffer id, or NULL
-   if one wasn't defined for that buffer.  */
-const char *
-html_base (void)
-{
-  return global_state.base;
-}
-
-/* Create a malloc'ed copy of text in the range [beg, end), but with
-   the HTML entities processed.  Recognized entities are &lt, &gt,
-   &amp, &quot, &nbsp and the numerical entities.  */
-
-char *
-html_decode_entities (const char *beg, const char *end)
-{
-  char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
-  const char *from = beg;
-  char *to = newstr;
-
-  while (from < end)
-    {
-      if (*from != '&')
-	*to++ = *from++;
-      else
-	{
-	  const char *save = from;
-	  int remain;
-
-	  if (++from == end) goto lose;
-	  remain = end - from;
-
-	  if (*from == '#')
-	    {
-	      int numeric;
-	      ++from;
-	      if (from == end || !ISDIGIT (*from)) goto lose;
-	      for (numeric = 0; from < end && ISDIGIT (*from); from++)
-		numeric = 10 * numeric + (*from) - '0';
-	      if (from < end && ISALPHA (*from)) goto lose;
-	      numeric &= 0xff;
-	      *to++ = numeric;
-	    }
-#define FROB(literal) (remain >= (sizeof (literal) - 1)			\
-		 && !memcmp (from, literal, sizeof (literal) - 1)	\
-		 && (*(from + sizeof (literal) - 1) == ';'		\
-		     || remain == sizeof (literal) - 1			\
-		     || !ISALNUM (*(from + sizeof (literal) - 1))))
-	  else if (FROB ("lt"))
-	    *to++ = '<', from += 2;
-	  else if (FROB ("gt"))
-	    *to++ = '>', from += 2;
-	  else if (FROB ("amp"))
-	    *to++ = '&', from += 3;
-	  else if (FROB ("quot"))
-	    *to++ = '\"', from += 4;
-	  /* We don't implement the "Added Latin 1" entities proposed
-	     by rfc1866 (except for nbsp), because it is unnecessary
-	     in the context of Wget, and would require hashing to work
-	     efficiently.  */
-	  else if (FROB ("nbsp"))
-	    *to++ = 160, from += 4;
-	  else
-	    goto lose;
-#undef FROB
-	  /* If the entity was followed by `;', we step over the `;'.
-	     Otherwise, it was followed by either a non-alphanumeric
-	     or EOB, in which case we do nothing.  */
-	  if (from < end && *from == ';')
-	    ++from;
-	  continue;
-
-	lose:
-	  /* This was not an entity after all.  Back out.  */
-	  from = save;
-	  *to++ = *from++;
-	}
-    }
-  *to++ = '\0';
-  /* #### Should we try to do this: */
-#if 0
-  newstr = xrealloc (newstr, to - newstr);
-#endif
-  return newstr;
-}
-
-/* The function returns the pointer to the malloc-ed quoted version of
-   string s.  It will recognize and quote numeric and special graphic
-   entities, as per RFC1866:
-
-   `&' -> `&amp;'
-   `<' -> `&lt;'
-   `>' -> `&gt;'
-   `"' -> `&quot;'
-
-   No other entities are recognized or replaced.  */
-static char *
-html_quote_string (const char *s)
-{
-  const char *b = s;
-  char *p, *res;
-  int i;
-
-  /* Pass through the string, and count the new size.  */
-  for (i = 0; *s; s++, i++)
-    {
-      if (*s == '&')
-	i += 4;                /* `amp;' */
-      else if (*s == '<' || *s == '>')
-	i += 3;                /* `lt;' and `gt;' */
-      else if (*s == '\"')
-	i += 5;                /* `quot;' */
-    }
-  res = (char *)xmalloc (i + 1);
-  s = b;
-  for (p = res; *s; s++)
-    {
-      switch (*s)
-	{
-	case '&':
-	  *p++ = '&';
-	  *p++ = 'a';
-	  *p++ = 'm';
-	  *p++ = 'p';
-	  *p++ = ';';
-	  break;
-	case '<': case '>':
-	  *p++ = '&';
-	  *p++ = (*s == '<' ? 'l' : 'g');
-	  *p++ = 't';
-	  *p++ = ';';
-	  break;
-	case '\"':
-	  *p++ = '&';
-	  *p++ = 'q';
-	  *p++ = 'u';
-	  *p++ = 'o';
-	  *p++ = 't';
-	  *p++ = ';';
-	  break;
-	default:
-	  *p++ = *s;
-	}
-    }
-  *p = '\0';
-  return res;
-}
-
-/* The function creates an HTML index containing references to given
-   directories and files on the appropriate host.  The references are
-   FTP.  */
-uerr_t
-ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
-{
-  FILE *fp;
-  char *upwd;
-  char *htclfile;		/* HTML-clean file name */
-
-  if (!opt.dfp)
-    {
-      fp = fopen (file, "wb");
-      if (!fp)
-	{
-	  logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
-	  return FOPENERR;
-	}
-    }
-  else
-    fp = opt.dfp;
-  if (u->user)
-    {
-      char *tmpu, *tmpp;        /* temporary, clean user and passwd */
-
-      tmpu = CLEANDUP (u->user);
-      tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
-      upwd = (char *)xmalloc (strlen (tmpu)
-			     + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
-      sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
-      xfree (tmpu);
-      FREE_MAYBE (tmpp);
-    }
-  else
-    upwd = xstrdup ("");
-  fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
-  fprintf (fp, "<html>\n<head>\n<title>");
-  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
-  fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
-  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
-  fprintf (fp, "</h1>\n<hr>\n<pre>\n");
-  while (f)
-    {
-      fprintf (fp, "  ");
-      if (f->tstamp != -1)
-	{
-	  /* #### Should we translate the months? */
-	  static char *months[] = {
-	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
-	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
-	  };
-	  struct tm *ptm = localtime ((time_t *)&f->tstamp);
-
-	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
-		  ptm->tm_mday);
-	  if (ptm->tm_hour)
-	    fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
-	  else
-	    fprintf (fp, "       ");
-	}
-      else
-	fprintf (fp, _("time unknown       "));
-      switch (f->type)
-	{
-	case FT_PLAINFILE:
-	  fprintf (fp, _("File        "));
-	  break;
-	case FT_DIRECTORY:
-	  fprintf (fp, _("Directory   "));
-	  break;
-	case FT_SYMLINK:
-	  fprintf (fp, _("Link        "));
-	  break;
-	default:
-	  fprintf (fp, _("Not sure    "));
-	  break;
-	}
-      htclfile = html_quote_string (f->name);
-      fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
-      if (*u->dir != '/')
-	putc ('/', fp);
-      fprintf (fp, "%s", u->dir);
-      if (*u->dir)
-	putc ('/', fp);
-      fprintf (fp, "%s", htclfile);
-      if (f->type == FT_DIRECTORY)
-	putc ('/', fp);
-      fprintf (fp, "\">%s", htclfile);
-      if (f->type == FT_DIRECTORY)
-	putc ('/', fp);
-      fprintf (fp, "</a> ");
-      if (f->type == FT_PLAINFILE)
-	fprintf (fp, _(" (%s bytes)"), legible (f->size));
-      else if (f->type == FT_SYMLINK)
-	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
-      putc ('\n', fp);
-      xfree (htclfile);
-      f = f->next;
-    }
-  fprintf (fp, "</pre>\n</body>\n</html>\n");
-  xfree (upwd);
-  if (!opt.dfp)
-    fclose (fp);
-  else
-    fflush (fp);
-  return FTPOK;
-}