[svn] Removed.

2024-07-03 16:38:41 -04:00 · 2000-12-09 18:25:19 -08:00 · 2000-12-09 18:25:19 -08:00 · 3665d73453
commit 3665d73453
parent 2878b0644b
1 changed files with 0 additions and 684 deletions
--- a/src/html.c
+++ b/src/html.c
@@ -1,684 +0,0 @@
 /* A simple HTML parser.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
 This file is part of Wget.
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
 #include <config.h>
 #include <ctype.h>
 #ifdef HAVE_STRING_H
 # include <string.h>
 #else
 # include <strings.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <errno.h>
 #include "wget.h"
 #include "url.h"
 #include "utils.h"
 #include "ftp.h"
 #include "html.h"
 #ifndef errno
 extern int errno;
 #endif
 /* Parser state for this file.  htmlfindurl() keeps its position and
    the current tag/attribute here between calls, and html_base()
    reads the `base' member from it.  There is exactly one such object,
    so every call to htmlfindurl() shares it.  */
 static state_t global_state;
 /* One TAG/ATTRIBUTE pair whose attribute value may hold a URL.
    Tables of these are terminated by an entry whose `tag' is NULL.
    The members point at string literals and are only ever read, hence
    the const qualification.  */
 struct tag_attr {
   const char *tag;		/* element name, e.g. "img" */
   const char *attr;		/* attribute name, e.g. "src" */
 };
 /* Match a string against a null-terminated list of identifiers.  */
 static int
 idmatch (struct tag_attr *tags, const char *tag, const char *attr)
 {
  int  i, j;
  if (tag == NULL || attr == NULL)
    return FALSE;
  for (i = 0; tags[i].tag; i++)
    /* Loop through all the tags wget ever cares about. */
    if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
      /* The tag and attribute matched one of the ones wget cares about. */
      {
 	if (opt.ignore_tags)
 	  /* --ignore-tags was specified.  Do not match these specific tags.
 	     --ignore-tags takes precedence over --follow-tags, so we process
 	     --ignore first and fall through if there's no match. */
 	  for (j = 0; opt.ignore_tags[j] != NULL; j++)
 	    /* Loop through all the tags this user doesn't care about. */
 	    if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
 	      return FALSE;
 	if (opt.follow_tags)
 	  /* --follow-tags was specified.  Only match these specific tags, so
 	     return FALSE if we don't match one of them. */
 	  {
 	    for (j = 0; opt.follow_tags[j] != NULL; j++)
 	      /* Loop through all the tags this user cares about. */
 	      if (strcasecmp(opt.follow_tags[j], tag) == EQ)
 		return TRUE;
 	    return FALSE;  /* wasn't one of the explicitly desired tags */
 	  }
 	/* If we get to here, --follow-tags isn't being used, and --ignore-tags,
 	   if specified, didn't include this tag, so it's okay to follow. */
 	return TRUE;
      }
  return FALSE;  /* not one of the tag/attribute pairs wget ever cares about */
 }
 /* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
   describing URLs to follow.  When a tag is encountered, extract its
   components (as described by html_allow[] array), and return the
   address and the length of the string.  Return NULL if no URL is
   found.

   The parser keeps its state (current tag, current attribute, quote
   status, BASE value) in the file-static global_state.  A non-zero
   INIT zeroes that state before parsing.  When a value is returned,
   `at_value' is set in the state so that the next call continues with
   the remaining attributes of the same tag instead of looking for a
   new '<'.  (Callers presumably pass BUF advanced past the returned
   value on the next call -- confirm against the caller.)

   The returned pointer points into BUF; the length is stored in *SIZE
   and stops at the first '#' that is not directly preceded by '&'.

   If DASH_P_LEAF_HTML is non-zero, values of <a> and <area> are not
   returned, and a <link> HREF is returned only when its REL value
   begins with "stylesheet".

   When the buffer is exhausted the tag, attribute and base strings are
   freed, the state is zeroed, and NULL is returned.  */
 const char *
 htmlfindurl (const char *buf, int bufsize, int *size, int init,
 	     int dash_p_leaf_HTML)
 {
  const char *p, *ph;
  state_t    *s = &global_state;
  /* NULL-terminated list of tags and modifiers someone would want to
     follow -- feel free to edit to suit your needs: */
  static struct tag_attr html_allow[] = {
    { "script", "src" },
    { "img", "src" },
    { "img", "href" },
    { "body", "background" },
    { "frame", "src" },
    { "iframe", "src" },
    { "fig", "src" },
    { "overlay", "src" },
    { "applet", "code" },
    /* NOTE(review): the next entry repeats the first one in this table.  */
    { "script", "src" },
    { "embed", "src" },
    { "bgsound", "src" },
    { "img", "lowsrc" },
    { "input", "src" },
    { "layer", "src" },
    { "table", "background"},
    { "th", "background"},
    { "td", "background"},
    /* Tags below this line are treated specially.  */
    { "a", "href" },
    { "area", "href" },
    { "base", "href" },
    { "link", "href" },
    { "link", "rel" },
    { "meta", "content" },
    { NULL, NULL }
  };
  if (init)
    {
      DEBUGP (("Resetting a parser state.\n"));
      memset (s, 0, sizeof (*s));
    }
  /* Each iteration of this loop handles one tag (or the remainder of
     the tag we were in when the previous call returned).  */
  while (1)
    {
      /* Values seen so far for the HREF and REL attributes of the
         current <link> tag.  They are reset on every iteration.  */
      const char*  link_href = NULL;
      const char*  link_rel = NULL;
      int          link_href_saved_size = 0; /* init. just to shut up warning */
      if (!bufsize)
 	break;
      /* Let's look for a tag, if we are not already in one.  */
      if (!s->at_value)
 	{
 	  /* Find '<'.  */
 	  if (*buf != '<')
 	    for (; bufsize && *buf != '<'; ++buf, --bufsize);
 	  if (!bufsize)
 	    break;
 	  /* Skip spaces.  */
 	  for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 	       ++buf, --bufsize);
 	  if (!bufsize)
 	    break;
 	  p = buf;
 	  /* Find the tag end.  */
 	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 	       ++buf, --bufsize);
 	  if (!bufsize)
 	    break;
 	  if (*buf == '=')
 	    {
 	      /* <tag=something> is illegal.  Just skip it.  */
 	      ++buf, --bufsize;
 	      continue;
 	    }
 	  if (p == buf)
 	    {
 	      /* *buf == '>'.  */
 	      ++buf, --bufsize;
 	      continue;
 	    }
 	  s->tag = strdupdelim (p, buf);
 	  if (*buf == '>')
 	    {
 	      /* A tag without attributes cannot carry a URL; drop it.  */
 	      xfree (s->tag);
 	      s->tag = NULL;
 	      ++buf, --bufsize;
 	      continue;
 	    }
 	}
      else                      /* s->at_value */
 	{
 	  /* Reset AT_VALUE.  */
 	  s->at_value = 0;
 	  /* If in quotes, just skip out of them and continue living.  */
 	  if (s->in_quote)
 	    {
 	      s->in_quote = 0;
 	      for (; bufsize && *buf != s->quote_char; ++buf, --bufsize);
 	      if (!bufsize)
 		break;
 	      ++buf, --bufsize;
 	    }
 	  if (!bufsize)
 	    break;
 	  if (*buf == '>')
 	    {
 	      /* The tag we were in has ended; forget it and look for
 		 the next one.  */
 	      FREE_MAYBE (s->tag);
 	      FREE_MAYBE (s->attr);
 	      s->tag = s->attr = NULL;
 	      continue;
 	    }
 	}
      /* Find the attributes.  */
      do
 	{
 	  FREE_MAYBE (s->attr);
 	  s->attr = NULL;
 	  if (!bufsize)
 	    break;
 	  /* Skip the spaces if we have them.  We don't have them at
 	     places like <img alt="something"src="something-else">.
 	                                     ^ no spaces here */
 	  if (ISSPACE (*buf))
 	    for (++buf, --bufsize; bufsize && ISSPACE (*buf) && *buf != '>';
 		 ++buf, --bufsize);
 	  if (!bufsize || *buf == '>')
 	    break;
 	  if (*buf == '=')
 	    {
 	      /* This is the case of <tag = something>, which is
 		 illegal.  Just skip it.  */
 	      ++buf, --bufsize;
 	      continue;
 	    }
 	  p = buf;
 	  /* Find the attribute end.  */
 	  for (; bufsize && !ISSPACE (*buf) && *buf != '>' && *buf != '=';
 	       ++buf, --bufsize);
 	  if (!bufsize || *buf == '>')
 	    break;
 	  /* Construct the attribute.  */
 	  s->attr = strdupdelim (p, buf);
 	  /* Now we must skip the spaces to find '='.  */
 	  if (*buf != '=')
 	    {
 	      for (; bufsize && ISSPACE (*buf) && *buf != '>';
 		   ++buf, --bufsize);
 	      if (!bufsize || *buf == '>')
 		break;
 	    }
 	  /* If we still don't have '=', something is amiss.  */
 	  if (*buf != '=')
 	    continue;
 	  /* Find the beginning of attribute value by skipping the
 	     spaces.  */
 	  ++buf, --bufsize;
 	  for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
 	  if (!bufsize || *buf == '>')
 	    break;
 	  /* PH will point at the first unprotected '#' in the value,
 	     if there is one.  */
 	  ph = NULL;
 	  /* The value of an attribute can, but does not have to be
 	     quoted.  */
 	  if (*buf == '\"' || *buf == '\'')
 	    {
 	      s->in_quote = 1;
 	      s->quote_char = *buf;
 	      p = buf + 1;
 	      /* Scan up to the closing quote, but give up at a newline.  */
 	      for (++buf, --bufsize;
 		   bufsize && *buf != s->quote_char && *buf != '\n';
 		   ++buf, --bufsize)
 		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		{
 		  s->in_quote = 0;
 		  break;
 		}
 	      if (*buf == '\n')
 		{
 		  /* #### Is the following logic good?
 		     Obviously no longer in quote.  It might be well
 		     to check whether '>' was encountered, but that
 		     would be encouraging writers of invalid HTMLs,
 		     and we don't want that, now do we?  */
 		  s->in_quote = 0;
 		  continue;
 		}
 	    }
 	  else
 	    {
 	      /* Unquoted value: it runs up to whitespace or '>'.  */
 	      p = buf;
 	      for (; bufsize && !ISSPACE (*buf) && *buf != '>';
 		   ++buf, --bufsize)
 		if (!ph && *buf == '#' && *(buf - 1) != '&')
 		  ph = buf;
 	      if (!bufsize)
 		break;
 	    }
 	  /* If '#' was found unprotected in a URI, it is probably an
 	     HTML marker, or color spec.  */
 	  *size = (ph ? ph : buf) - p;
 	  /* The URI is liable to be returned if:
 	     1) *size != 0;
 	     2) its tag and attribute are found in html_allow.  */
 	  if (*size && idmatch (html_allow, s->tag, s->attr))
 	    {
 	      if (strcasecmp(s->tag, "a") == EQ ||
 		  strcasecmp(s->tag, "area") == EQ)
 		{
 		  /* Only follow these if we're not at a -p leaf node, as they
 		     always link to external documents. */
 		  if (!dash_p_leaf_HTML)
 		    {
 		      s->at_value = 1;
 		      return p;
 		    }
 		}
 	      else if (!strcasecmp (s->tag, "base") &&
 		       !strcasecmp (s->attr, "href"))
 		{
 		  /* Remember the BASE value for html_base() rather than
 		     returning it.  Note that the whole value up to BUF
 		     is copied, not just the first *SIZE characters.  */
 		  FREE_MAYBE (s->base);
 		  s->base = strdupdelim (p, buf);
 		}
 	      else if (strcasecmp(s->tag, "link") == EQ)
 		{
 		  if (strcasecmp(s->attr, "href") == EQ)
 		    {
 		      link_href = p;
 		      link_href_saved_size = *size;  /* for restoration below */
 		    }
 		  else if (strcasecmp(s->attr, "rel") == EQ)
 		    link_rel = p;
 		  if (link_href != NULL && link_rel != NULL)
 		    /* Okay, we've now seen this <LINK> tag's HREF and REL
 		       attributes (they may be in either order), so it's now
 		       possible to decide if we want to traverse it. */
 		    if (!dash_p_leaf_HTML ||
 			strncasecmp(link_rel, "stylesheet",
 				    sizeof("stylesheet") - 1) == EQ)
 		      /* In the normal case, all <LINK> tags are fair game.
 			 In the special case of when -p is active, however, and
 			 we're at a leaf node (relative to the -l max. depth) in
 			 the HTML document tree, the only <LINK> tag we'll
 			 follow is a <LINK REL="stylesheet">, as it's necessary
 			 for displaying this document properly.  We won't follow
 			 other <LINK> tags, like <LINK REL="home">, for
 			 instance, as they refer to external documents.
 			 Note that the above strncasecmp() will incorrectly
 			 consider something like '<LINK REL="stylesheet.old"' as
 			 equivalent to '<LINK REL="stylesheet"'.  Not really
 			 worth the trouble to explicitly check for such cases --
 			 if time is spent, it should be spent ripping out wget's
 			 somewhat kludgy HTML parser and hooking in a real,
 			 componentized one. */
 		      {
 			/* When we return, the 'size' IN/OUT parameter
 			   determines where in the buffer the end of the current
 			   attribute value is.  If REL came after HREF in this
 			   <LINK> tag, size is currently set to the size for
 			   REL's value -- set it to what it was when we were
 			   looking at HREF's value. */
 			*size = link_href_saved_size;
 			s->at_value = 1;
 			return link_href;
 		      }
 		}
 	      else if (!strcasecmp (s->tag, "meta") &&
 		       !strcasecmp (s->attr, "content"))
 		{
 		  /* Some pages use a META tag to specify that the page
 		     be refreshed by a new page after a given number of
 		     seconds.  We need to attempt to extract an URL for
 		     the new page from the other garbage present.  The
 		     general format for this is:
 		     <META HTTP-EQUIV=Refresh CONTENT="0; URL=index2.html">
 		     So we just need to skip past the "0; URL="
 		     garbage to get to the URL.  META tags are also
 		     used for specifying random things like the page
 		     author's name and what editor was used to create
 		     it.  So we need to be careful to ignore them and
 		     not assume that an URL will be present at all.  */
 		  /* P and *SIZE are adjusted together so that they keep
 		     describing the not-yet-consumed part of the value.  */
 		  for (; *size && ISDIGIT (*p); p++, *size -= 1);
 		  if (*p == ';')
 		    {
 		      for (p++, *size -= 1;
 			   *size && ISSPACE (*p);
 			   p++, *size -= 1) ;
 		      if (!strncasecmp (p, "URL=", 4))
 			{
 			  p += 4, *size -= 4;
 			  s->at_value = 1;
 			  return p;
 			}
 		    }
 		}
 	      else
 		{
 		  /* An ordinary entry of html_allow: return it as is.  */
 		  s->at_value = 1;
 		  return p;
 		}
 	    }
 	  /* Exit from quote.  */
 	  if (*buf == s->quote_char)
 	    {
 	      s->in_quote = 0;
 	      ++buf, --bufsize;
 	    }
 	} while (*buf != '>');
      /* Done with this tag; release its name and last attribute.  */
      FREE_MAYBE (s->tag);
      FREE_MAYBE (s->attr);
      s->tag = s->attr = NULL;
      if (!bufsize)
 	break;
    }
  /* The buffer is exhausted: destroy the parser state, including the
     saved BASE value.  */
  FREE_MAYBE (s->tag);
  FREE_MAYBE (s->attr);
  FREE_MAYBE (s->base);
  memset (s, 0, sizeof (*s));	/* just to be sure */
  DEBUGP (("HTML parser ends here (state destroyed).\n"));
  return NULL;
 }
 /* Return the BASE HREF value currently remembered by the HTML parser,
    or NULL if the parser state holds none.  The string belongs to the
    parser state; the caller must not free it.  */
 const char *
 html_base (void)
 {
   const char *base = global_state.base;

   return base;
 }
 /* Create a malloc'ed copy of text in the range [beg, end), but with
    the HTML entities processed.  Recognized entities are &lt, &gt,
    &amp, &quot, &nbsp and the numerical entities (&#NNN, reduced
    modulo 256).  A trailing `;' after an entity is consumed when
    present.  Anything that looks like an entity but is not recognized
    is copied through unchanged.

    The result is NUL-terminated and must be freed by the caller.  No
    byte at or beyond END is ever read.  */
 char *
 html_decode_entities (const char *beg, const char *end)
 {
   /* Decoding never lengthens the text, so the input length (plus the
      terminating NUL) is the worst case.  */
   char *newstr = (char *)xmalloc (end - beg + 1);
   const char *from = beg;
   char *to = newstr;

   while (from < end)
     {
       if (*from != '&')
         *to++ = *from++;
       else
         {
           const char *save = from;
           int remain;

           if (++from == end)
             goto lose;
           remain = end - from;
           if (*from == '#')
             {
               /* Unsigned, so that an absurdly long run of digits
                  wraps around instead of overflowing a signed int;
                  only the low eight bits are kept anyway.  */
               unsigned int numeric;

               ++from;
               if (from == end || !ISDIGIT (*from))
                 goto lose;
               for (numeric = 0; from < end && ISDIGIT (*from); from++)
                 numeric = 10 * numeric + (unsigned int) (*from - '0');
               if (from < end && ISALPHA (*from))
                 goto lose;
               numeric &= 0xff;
               *to++ = (char) numeric;
             }
           /* True if the text at FROM is the entity name LITERAL,
              followed by `;', by a non-alphanumeric character, or by
              the end of the range.  The end-of-range test comes first
              so that the byte after the name is looked at only when
              it lies inside [beg, end).  */
#define FROB(literal) (remain >= (int) (sizeof (literal) - 1)		\
                  && !memcmp (from, literal, sizeof (literal) - 1)	\
                  && (remain == (int) (sizeof (literal) - 1)		\
                      || *(from + sizeof (literal) - 1) == ';'		\
                      || !ISALNUM (*(from + sizeof (literal) - 1))))
           else if (FROB ("lt"))
             *to++ = '<', from += 2;
           else if (FROB ("gt"))
             *to++ = '>', from += 2;
           else if (FROB ("amp"))
             *to++ = '&', from += 3;
           else if (FROB ("quot"))
             *to++ = '\"', from += 4;
           /* We don't implement the "Added Latin 1" entities proposed
              by rfc1866 (except for nbsp), because it is unnecessary
              in the context of Wget, and would require hashing to work
              efficiently.  */
           else if (FROB ("nbsp"))
             *to++ = (char) 160, from += 4;
           else
             goto lose;
#undef FROB
           /* If the entity was followed by `;', we step over the `;'.
              Otherwise, it was followed by either a non-alphanumeric
              or EOB, in which case we do nothing.  */
           if (from < end && *from == ';')
             ++from;
           continue;

         lose:
           /* This was not an entity after all.  Back out.  */
           from = save;
           *to++ = *from++;
         }
     }
   *to++ = '\0';
   /* The buffer may be larger than the decoded text; it is deliberately
      not shrunk.  */
   return newstr;
 }
 /* Return a freshly allocated copy of S in which the characters that
    are special in HTML are replaced by entities:
    `&' -> `&amp;'
    `<' -> `&lt;'
    `>' -> `&gt;'
    `"' -> `&quot;'
    Every other character is copied verbatim.  The caller frees the
    result.  */
 static char *
 html_quote_string (const char *s)
 {
   const char *src;
   char *quoted, *dst;
   int len = 0;

   /* First pass: work out how long the quoted string will be.  */
   for (src = s; *src != '\0'; src++)
     switch (*src)
       {
       case '&':
         len += 5;              /* `&amp;' */
         break;
       case '<':
       case '>':
         len += 4;              /* `&lt;' and `&gt;' */
         break;
       case '\"':
         len += 6;              /* `&quot;' */
         break;
       default:
         len += 1;
         break;
       }

   quoted = (char *)xmalloc (len + 1);

   /* Second pass: copy, substituting entities as we go.  */
   dst = quoted;
   for (src = s; *src != '\0'; src++)
     {
       const char *entity = NULL;

       if (*src == '&')
         entity = "&amp;";
       else if (*src == '<')
         entity = "&lt;";
       else if (*src == '>')
         entity = "&gt;";
       else if (*src == '\"')
         entity = "&quot;";

       if (entity == NULL)
         *dst++ = *src;
       else
         while (*entity != '\0')
           *dst++ = *entity++;
     }
   *dst = '\0';
   return quoted;
 }
 /* The function creates an HTML index containing references to given
   directories and files on the appropriate host.  The references are
   FTP.  */
 uerr_t
 ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
 {
  FILE *fp;
  char *upwd;
  char *htclfile;		/* HTML-clean file name */
  if (!opt.dfp)
    {
      fp = fopen (file, "wb");
      if (!fp)
 	{
 	  logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 	  return FOPENERR;
 	}
    }
  else
    fp = opt.dfp;
  if (u->user)
    {
      char *tmpu, *tmpp;        /* temporary, clean user and passwd */
      tmpu = CLEANDUP (u->user);
      tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
      upwd = (char *)xmalloc (strlen (tmpu)
 			     + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
      sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
      xfree (tmpu);
      FREE_MAYBE (tmpp);
    }
  else
    upwd = xstrdup ("");
  fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
  fprintf (fp, "<html>\n<head>\n<title>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
  fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
  fprintf (fp, "</h1>\n<hr>\n<pre>\n");
  while (f)
    {
      fprintf (fp, "  ");
      if (f->tstamp != -1)
 	{
 	  /* #### Should we translate the months? */
 	  static char *months[] = {
 	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 	  };
 	  struct tm *ptm = localtime ((time_t *)&f->tstamp);
 	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
 		  ptm->tm_mday);
 	  if (ptm->tm_hour)
 	    fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
 	  else
 	    fprintf (fp, "       ");
 	}
      else
 	fprintf (fp, _("time unknown       "));
      switch (f->type)
 	{
 	case FT_PLAINFILE:
 	  fprintf (fp, _("File        "));
 	  break;
 	case FT_DIRECTORY:
 	  fprintf (fp, _("Directory   "));
 	  break;
 	case FT_SYMLINK:
 	  fprintf (fp, _("Link        "));
 	  break;
 	default:
 	  fprintf (fp, _("Not sure    "));
 	  break;
 	}
      htclfile = html_quote_string (f->name);
      fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
      if (*u->dir != '/')
 	putc ('/', fp);
      fprintf (fp, "%s", u->dir);
      if (*u->dir)
 	putc ('/', fp);
      fprintf (fp, "%s", htclfile);
      if (f->type == FT_DIRECTORY)
 	putc ('/', fp);
      fprintf (fp, "\">%s", htclfile);
      if (f->type == FT_DIRECTORY)
 	putc ('/', fp);
      fprintf (fp, "</a> ");
      if (f->type == FT_PLAINFILE)
 	fprintf (fp, _(" (%s bytes)"), legible (f->size));
      else if (f->type == FT_SYMLINK)
 	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
      putc ('\n', fp);
      xfree (htclfile);
      f = f->next;
    }
  fprintf (fp, "</pre>\n</body>\n</html>\n");
  xfree (upwd);
  if (!opt.dfp)
    fclose (fp);
  else
    fflush (fp);
  return FTPOK;
 }