wget/src/recur.c

/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "fnmatch.h"
#include "host.h"
#include "hash.h"
#include "res.h"

#ifndef errno
extern int errno;
#endif

extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  */
static slist *downloaded_html_files;

/* List of undesirable-to-load URLs.  */
static struct hash_table *undesirable_urls;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

static int first_time = 1;


/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  if (undesirable_urls)
    {
      string_set_free (undesirable_urls);
      undesirable_urls = NULL;
    }
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
  FREE_MAYBE (base_dir);
  first_time = 1;
}

/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
void
recursive_reset (void)
{
  first_time = 1;
}

/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored to a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
{
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int meta_disallow_follow;
  int this_url_ftp;            /* See below the explanation */
  uerr_t err;
  urlpos *url_list, *cur_url;
  struct urlinfo *u;

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  /* Cache the current URL in the list.  */
  if (first_time)
    {
      /* These three operations need to be done only once per Wget
         run.  They should probably be at a different location.  */
      if (!undesirable_urls)
	undesirable_urls = make_string_hash_table (0);

      hash_table_clear (undesirable_urls);
      string_set_add (undesirable_urls, this_url);
      /* Enter this_url to the hash table, in original and "enhanced" form.  */
      u = newurl ();
      err = parseurl (this_url, u, 0);
      if (err == URLOK)
	{
	  string_set_add (undesirable_urls, u->url);
	  if (opt.no_parent)
	    base_dir = xstrdup (u->dir); /* Set the base dir.  */
	  /* Set the canonical this_url to be sent as referer.  This
	     problem exists only when running the first time.  */
	  canon_this_url = xstrdup (u->url);
	}
      else
	{
	  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	  base_dir = NULL;
	}
      freeurl (u, 1);
      depth = 1;
      first_time = 0;
    }
  else
    ++depth;

  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user. */
    {
      if (opt.page_requisites && depth <= opt.reclevel + 1)
	/* When -p is specified, we can do one more partial recursion from the
	   "leaf nodes" on the HTML document tree.  The recursion is partial in
	   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
	   except for <LINK REL="stylesheet">. */
	dash_p_leaf_HTML = TRUE;
      else
	/* Either -p wasn't specified or it was and we've already gone the one
	   extra (pseudo-)level that it affords us, so we need to bail out. */
	{
	  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
		   depth, opt.reclevel));
	  --depth;
	  return RECLEVELEXC;
	}
    }

  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (url_scheme (this_url) == SCHEME_FTP);

  /* Get the URL-s from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
			    dash_p_leaf_HTML, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    {
      /* The META tag says we are not to follow this file.  Respect
         that.  */
      free_urlpos (url_list);
      url_list = NULL;
    }

  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
    {
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
	break;
      /* Parse the URL for convenient use in other functions, as well
	 as to get the optimized form.  It also checks URL integrity.  */
      u = newurl ();
      if (parseurl (cur_url->url, u, 0) != URLOK)
	{
	  DEBUGP (("Yuck!  A bad URL.\n"));
	  freeurl (u, 1);
	  continue;
	}
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checkings whether a file is acceptable to load:
	 1. check if URL is ftp, and we don't load it
	 2. check for relative links (if relative_only is set)
	 3. check for domain
	 4. check for no-parent
	 5. check for excludes && includes
	 6. check for suffix
	 7. check for same host (if spanhost is unset), with possible
	 gethostbyname baggage
	 8. check for robots.txt

	 Addendum: If the URL is FTP, and it is to be loaded, only the
	 domain and suffix settings are "stronger".

	 Note that .html and (yuck) .htm will get loaded regardless of
	 suffix rules (but that is remedied later with unlink) unless
	 the depth equals the maximum depth.

	 More time- and memory- consuming tests should be put later on
	 the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
	 undesirable_urls.  Using it is crucial to avoid unnecessary
	 repeated continuous hits to the hash table.  */
      inl = string_set_contains (undesirable_urls, constr);

      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (!inl)
	if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
	  {
	    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If it is absolute link and they are not followed, chuck it
	 out.  */
      if (!inl && u->scheme != SCHEME_FTP)
	if (opt.relative_only && !cur_url->link_relative_p)
	  {
	    DEBUGP (("It doesn't really look like a relative link.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!inl)
	if (!accept_domain (u))
	  {
	    DEBUGP (("I don't like the smell of that domain.\n"));
	    string_set_add (undesirable_urls, constr);
	    inl = 1;
	  }
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
	  /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
	  && !(!this_url_ftp && u->scheme == SCHEME_FTP))
	{
	  /* Check for base_dir first.  */
	  if (!(base_dir && frontcmp (base_dir, u->dir)))
	    {
	      /* Failing that, check for parent dir.  */
	      struct urlinfo *ut = newurl ();
	      if (parseurl (this_url, ut, 0) != URLOK)
		DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
	      else if (!frontcmp (ut->dir, u->dir))
		{
		  /* Failing that too, kill the URL.  */
		  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	      freeurl (ut, 1);
	    }
	}
      /* If the file does not match the acceptance list, or is on the
	 rejection list, chuck it out.  The same goes for the
	 directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
	{
	  if (!accdir (u->dir, ALLABS))
	    {
	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}
      if (!inl)
	{
	  char *suf = NULL;
	  /* We check for acceptance/rejection rules only for non-HTML
	     documents.  Since we don't know whether they really are
	     HTML, it will be deduced from (an OR-ed list):

	     1) u->file is "" (meaning it is a directory)
	     2) suffix exists, AND:
	     a) it is "html", OR
	     b) it is "htm"

	     If the file *is* supposed to be HTML, it will *not* be
            subject to acc/rej rules, unless a finite maximum depth has
            been specified and the current depth is the maximum depth. */
	  if (!
	      (!*u->file
	       || (((suf = suffix (constr)) != NULL)
                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                      && ((opt.reclevel != INFINITE_RECURSION) &&
			  (depth != opt.reclevel))))))
	    {
	      if (!acceptable (u->file))
		{
		  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
			  constr, u->file));
		  string_set_add (undesirable_urls, constr);
		  inl = 1;
		}
	    }
	  FREE_MAYBE (suf);
	}
      /* Optimize the URL (which includes possible DNS lookup) only
	 after all other possibilities have been exhausted.  */
      if (!inl)
	{
	  if (!opt.simple_check)
	    opt_url (u);
	  else
	    {
	      char *p;
	      /* Just lowercase the hostname.  */
	      for (p = u->host; *p; p++)
		*p = TOLOWER (*p);
	      xfree (u->url);
	      u->url = str_url (u, 0);
	    }
	  xfree (constr);
	  constr = xstrdup (u->url);
	  /* After we have canonicalized the URL, check if we have it
	     on the black list. */
	  if (string_set_contains (undesirable_urls, constr))
	    inl = 1;
	  /* This line is bogus. */
	  /*string_set_add (undesirable_urls, constr);*/

	  if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
	      {
		DEBUGP (("This is not the same hostname as the parent's.\n"));
		string_set_add (undesirable_urls, constr);
		inl = 1;
	      }
	}
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->scheme == SCHEME_FTP)
	{
	  struct robot_specs *specs = res_get_specs (u->host, u->port);
	  if (!specs)
	    {
	      char *rfile;
	      if (res_retrieve_file (constr, &rfile))
		{
		  specs = res_parse_from_file (rfile);
		  xfree (rfile);
		}
	      else
		{
		  /* If we cannot get real specs, at least produce
		     dummy ones so that we can register them and stop
		     trying to retrieve them.  */
		  specs = res_parse ("", 0);
		}
	      res_register_specs (u->host, u->port, specs);
	    }

	  /* Now that we have (or don't have) robots.txt specs, we can
	     check what they say.  */
	  if (!res_match_path (specs, u->path))
	    {
	      DEBUGP (("Not following %s because robots.txt forbids it.\n",
		       constr));
	      string_set_add (undesirable_urls, constr);
	      inl = 1;
	    }
	}

      filename = NULL;
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
	{
	  DEBUGP (("I've decided to load it -> "));
	  /* Add it to the list of already-loaded URL-s.  */
	  string_set_add (undesirable_urls, constr);
	  /* Automatically followed FTPs will *not* be downloaded
	     recursively.  */
	  if (u->scheme == SCHEME_FTP)
	    {
	      /* Don't you adore side-effects?  */
	      opt.recursive = 0;
	    }
	  /* Reset its type.  */
	  dt = 0;
	  /* Retrieve it.  */
	  retrieve_url (constr, &filename, &newloc,
		       canon_this_url ? canon_this_url : this_url, &dt);
	  if (u->scheme == SCHEME_FTP)
	    {
	      /* Restore...  */
	      opt.recursive = 1;
	    }
	  if (newloc)
	    {
	      xfree (constr);
	      constr = newloc;
	    }
	  /* If there was no error, and the type is text/html, parse
	     it recursively.  */
	  if (dt & TEXTHTML)
	    {
	      if (dt & RETROKF)
		recursive_retrieve (filename, constr);
	    }
	  else
	    DEBUGP (("%s is not text/html so we don't chase.\n",
		     filename ? filename: "(null)"));

	  if (opt.delete_after || (filename && !acceptable (filename)))
	    /* Either --delete-after was specified, or we loaded this otherwise
	       rejected (e.g. by -R) HTML file just so we could harvest its
	       hyperlinks -- in either case, delete the local file. */
	    {
	      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
		       opt.delete_after ? "--delete-after" :
		       "recursive rejection criteria"));
	      logprintf (LOG_VERBOSE,
			 (opt.delete_after ? _("Removing %s.\n")
			  : _("Removing %s since it should be rejected.\n")),
			 filename);
	      if (unlink (filename))
		logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
	      dt &= ~RETROKF;
	    }

	  /* If everything was OK, and links are to be converted, let's
	     store the local filename.  */
	  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
	    {
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (filename);
	    }
	}
      else
	DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);
      FREE_MAYBE (constr);
      freeurl (u, 1);
      /* Increment the pbuf for the appropriate size.  */
    }
  if (opt.convert_links && !opt.delete_after)
    /* This is merely the first pass: the links that have been
       successfully downloaded are converted.  In the second pass,
       convert_all_links() will also convert those links that have NOT
       been downloaded to their canonical form.  */
    convert_links (file, url_list);
  /* Free the linked list of URL-s.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else
    return RETROK;
}

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);
  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* convert_links() is called from recursive_retrieve() after we're
   done with an HTML file.  This call to convert_links is not complete
   because it converts only the downloaded files, and Wget cannot know
   which files will be downloaded afterwards.  So, if we have file
   fileone.html with:

   <a href="/c/something.gif">

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
   in urls_downloaded.  From these two lists information is
   extracted.  */
void
convert_all_links (void)
{
  slist *html;

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
	DEBUGP (("It should correspond to %s.\n", url));
      else
	DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, url, FALSE, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */
      for (cur_url = urls; cur_url; cur_url = cur_url->next)
	{
	  char *local_name;

	  /* The URL must be in canonical form to be compared.  */
	  struct urlinfo *u = newurl ();
	  uerr_t res = parseurl (cur_url->url, u, 0);
	  if (res != URLOK)
	    {
	      freeurl (u, 1);
	      continue;
	    }
	  /* We decide the direction of conversion according to whether
	     a URL was downloaded.  Downloaded URLs will be converted
	     ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
	  local_name = hash_table_get (dl_url_file_map, u->url);
	  if (local_name)
	    DEBUGP (("%s marked for conversion, local %s\n",
		     u->url, local_name));
	  /* Decide on the conversion direction.  */
	  if (local_name)
	    {
	      /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
	      cur_url->convert = CO_CONVERT_TO_RELATIVE;
	      cur_url->local_name = xstrdup (local_name);
	    }
	  else
	    {
	      /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
	      if (!cur_url->link_complete_p)
		cur_url->convert = CO_CONVERT_TO_COMPLETE;
	      cur_url->local_name = NULL;
	    }
	  freeurl (u, 1);
	}
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      /* Free the data.  */
      free_urlpos (urls);
    }
}
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								/* Handling of recursive HTTP retrieving.
-												[svn] Update copyright blurbs with the year 2000.

											
										
										
											2000-11-01 20:50:03 -05:00
+								   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Update copyright notices.

											
										
										
											2001-05-27 15:35:15 -04:00
+								This file is part of GNU Wget.
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Update copyright notices.

											
										
										
											2001-05-27 15:35:15 -04:00
+								GNU Wget is free software; you can redistribute it and/or modify
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								it under the terms of the GNU General Public License as published by
 								the Free Software Foundation; either version 2 of the License, or
 								(at your option) any later version.
-												[svn] Update copyright notices.

											
										
										
											2001-05-27 15:35:15 -04:00
+								GNU Wget is distributed in the hope that it will be useful,
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								but WITHOUT ANY WARRANTY; without even the implied warranty of
 								MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 								GNU General Public License for more details.
 								You should have received a copy of the GNU General Public License
-												[svn] Update copyright notices.

											
										
										
											2001-05-27 15:35:15 -04:00
+								along with Wget; if not, write to the Free Software
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
 								#include <config.h>
 								#include <stdio.h>
 								#include <stdlib.h>
 								#ifdef HAVE_STRING_H
 								# include <string.h>
 								#else
 								# include <strings.h>
 								#endif /* HAVE_STRING_H */
 								#ifdef HAVE_UNISTD_H
 								# include <unistd.h>
 								#endif /* HAVE_UNISTD_H */
 								#include <errno.h>
 								#include <assert.h>
 								#include <sys/types.h>
 								#include "wget.h"
 								#include "url.h"
 								#include "recur.h"
 								#include "utils.h"
 								#include "retr.h"
 								#include "ftp.h"
 								#include "fnmatch.h"
 								#include "host.h"
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								#include "hash.h"
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+								#include "res.h"
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Include <netdb.h> where h_errno is used.  Likewise for <errno.h> and errno.
From <sxsvgsi7wcw.fsf@florida.arsdigita.de>.

											
										
										
											2000-12-17 13:12:02 -05:00
+								#ifndef errno
 								extern int errno;
 								#endif
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								extern char *version_string;
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								static struct hash_table *dl_file_url_map;
 								static struct hash_table *dl_url_file_map;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								/* List of HTML files downloaded in this Wget run.  Used for link
 								   conversion after Wget is done.  */
 								static slist *downloaded_html_files;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								/* List of undesirable-to-load URLs.  */
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								static struct hash_table *undesirable_urls;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								/* Current recursion depth.  */
 								static int depth;
 								/* Base directory we're recursing from (used by no_parent).  */
 								static char *base_dir;
 								static int first_time = 1;
 								/* Cleanup the data structures associated with recursive retrieving
 								   (the variables above).  */
 								void
 								recursive_cleanup (void)
 								{
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								  if (undesirable_urls)
 								    {
 								      string_set_free (undesirable_urls);
 								      undesirable_urls = NULL;
 								    }
 								  if (dl_file_url_map)
 								    {
 								      free_keys_and_values (dl_file_url_map);
 								      hash_table_destroy (dl_file_url_map);
 								      dl_file_url_map = NULL;
 								    }
 								  if (dl_url_file_map)
 								    {
 								      free_keys_and_values (dl_url_file_map);
 								      hash_table_destroy (dl_url_file_map);
 								      dl_url_file_map = NULL;
 								    }
 								  undesirable_urls = NULL;
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								  slist_free (downloaded_html_files);
 								  downloaded_html_files = NULL;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								  FREE_MAYBE (base_dir);
 								  first_time = 1;
 								}
 								/* Reset FIRST_TIME to 1, so that some action can be taken in
 								   recursive_retrieve().  */
 								void
 								recursive_reset (void)
 								{
 								  first_time = 1;
 								}
 								/* The core of recursive retrieving.  Endless recursion is avoided by
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								   having all URLs stored to a linked list of URLs, which is checked
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								   before loading any URL.  That way no URL can get loaded twice.
 								   The function also supports specification of maximum recursion depth
 								   and a number of other goodies.  */
 								uerr_t
 								recursive_retrieve (const char *file, const char *this_url)
 								{
 								  char *constr, *filename, *newloc;
 								  char *canon_this_url = NULL;
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								  int dt, inl, dash_p_leaf_HTML = FALSE;
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								  int meta_disallow_follow;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								  int this_url_ftp;            /* See below the explanation */
 								  uerr_t err;
 								  urlpos *url_list, *cur_url;
 								  struct urlinfo *u;
 								  assert (this_url != NULL);
 								  assert (file != NULL);
 								  /* If quota was exceeded earlier, bail out.  */
-												[svn] Gracefully handle opt.downloaded overflowing.
Published in <sxsd7gfnv17.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-01 18:17:31 -05:00
+								  if (downloaded_exceeds_quota ())
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    return QUOTEXC;
 								  /* Cache the current URL in the list.  */
 								  if (first_time)
 								    {
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								      /* These three operations need to be done only once per Wget
 								         run.  They should probably be at a different location.  */
 								      if (!undesirable_urls)
 									undesirable_urls = make_string_hash_table (0);
 								      hash_table_clear (undesirable_urls);
 								      string_set_add (undesirable_urls, this_url);
 								      /* Enter this_url to the hash table, in original and "enhanced" form.  */
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								      u = newurl ();
 								      err = parseurl (this_url, u, 0);
 								      if (err == URLOK)
 									{
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									  string_set_add (undesirable_urls, u->url);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  if (opt.no_parent)
 									    base_dir = xstrdup (u->dir); /* Set the base dir.  */
 									  /* Set the canonical this_url to be sent as referer.  This
 									     problem exists only when running the first time.  */
 									  canon_this_url = xstrdup (u->url);
 									}
 								      else
 									{
 									  DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 									  base_dir = NULL;
 									}
 								      freeurl (u, 1);
 								      depth = 1;
 								      first_time = 0;
 								    }
 								  else
 								    ++depth;
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
 								    /* We've exceeded the maximum recursion depth specified by the user. */
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    {
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								      if (opt.page_requisites && depth <= opt.reclevel + 1)
 									/* When -p is specified, we can do one more partial recursion from the
 									   "leaf nodes" on the HTML document tree.  The recursion is partial in
 									   that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
 									   except for <LINK REL="stylesheet">. */
 									dash_p_leaf_HTML = TRUE;
 								      else
 									/* Either -p wasn't specified or it was and we've already gone the one
 									   extra (pseudo-)level that it affords us, so we need to bail out. */
 									{
 									  DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
 										   depth, opt.reclevel));
 									  --depth;
 									  return RECLEVELEXC;
 									}
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    }
 								  /* Determine whether this_url is an FTP URL.  If it is, it means
 								     that the retrieval is done through proxy.  In that case, FTP
 								     links will be followed by default and recursion will not be
 								     turned off when following them.  */
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+								  this_url_ftp = (url_scheme (this_url) == SCHEME_FTP);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								  /* Get the URL-s from an HTML file: */
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+											    dash_p_leaf_HTML, &meta_disallow_follow);
 								  if (opt.use_robots && meta_disallow_follow)
 								    {
 								      /* The META tag says we are not to follow this file.  Respect
 								         that.  */
 								      free_urlpos (url_list);
 								      url_list = NULL;
 								    }
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								  /* Decide what to do with each of the URLs.  A URL will be loaded if
 								     it meets several requirements, discussed later.  */
 								  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
 								    {
 								      /* If quota was exceeded earlier, bail out.  */
-												[svn] Gracefully handle opt.downloaded overflowing.
Published in <sxsd7gfnv17.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-01 18:17:31 -05:00
+								      if (downloaded_exceeds_quota ())
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									break;
 								      /* Parse the URL for convenient use in other functions, as well
 									 as to get the optimized form.  It also checks URL integrity.  */
 								      u = newurl ();
 								      if (parseurl (cur_url->url, u, 0) != URLOK)
 									{
 									  DEBUGP (("Yuck!  A bad URL.\n"));
 									  freeurl (u, 1);
 									  continue;
 									}
 								      assert (u->url != NULL);
 								      constr = xstrdup (u->url);
 								      /* Several checkings whether a file is acceptable to load:
 . check if URL is ftp, and we don't load it
 . check for relative links (if relative_only is set)
 . check for domain
 . check for no-parent
 . check for excludes && includes
 . check for suffix
 . check for same host (if spanhost is unset), with possible
 									 gethostbyname baggage
 . check for robots.txt
 									 Addendum: If the URL is FTP, and it is to be loaded, only the
 									 domain and suffix settings are "stronger".
-												[svn] Applied Edward Sabol's patch.

											
										
										
											2000-03-02 08:28:31 -05:00
+									 Note that .html and (yuck) .htm will get loaded regardless of
 									 suffix rules (but that is remedied later with unlink) unless
 									 the depth equals the maximum depth.
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 									 More time- and memory- consuming tests should be put later on
 									 the list.  */
 								      /* inl is set if the URL we are working on (constr) is stored in
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									 undesirable_urls.  Using it is crucial to avoid unnecessary
 									 repeated continuous hits to the hash table.  */
-												[svn] Commit several minor changes:

* main.c (print_help): Document `--no-http-keep-alive'.

* utils.c (numdigit): Handle negative numbers *correctly*.

* hash.c (make_nocase_string_hash_table): Use term "nocase" rather
than the confusing "unsigned".

* utils.c (string_set_contains): Renamed from string_set_exists.

* hash.c (hash_table_contains): Renamed from hash_table_exists.

* cookies.c: Move case-insensitive hash tables to hash.c.

Published in <sxsheyq9vvl.fsf@florida.arsdigita.de>.

											
										
										
											2001-05-12 16:06:41 -04:00
+								      inl = string_set_contains (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								      /* If it is FTP, and FTP is not followed, chuck it out.  */
 								      if (!inl)
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+									if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  {
 									    DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									    string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    inl = 1;
 									  }
 								      /* If it is absolute link and they are not followed, chuck it
 									 out.  */
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+								      if (!inl && u->scheme != SCHEME_FTP)
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									if (opt.relative_only && !cur_url->link_relative_p)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  {
 									    DEBUGP (("It doesn't really look like a relative link.\n"));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									    string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    inl = 1;
 									  }
 								      /* If its domain is not to be accepted/looked-up, chuck it out.  */
 								      if (!inl)
 									if (!accept_domain (u))
 									  {
 									    DEBUGP (("I don't like the smell of that domain.\n"));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									    string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    inl = 1;
 									  }
 								      /* Check for parent directory.  */
 								      if (!inl && opt.no_parent
 									  /* If the new URL is FTP and the old was not, ignore
 								             opt.no_parent.  */
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+									  && !(!this_url_ftp && u->scheme == SCHEME_FTP))
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									{
 									  /* Check for base_dir first.  */
 									  if (!(base_dir && frontcmp (base_dir, u->dir)))
 									    {
 									      /* Failing that, check for parent dir.  */
 									      struct urlinfo *ut = newurl ();
 									      if (parseurl (this_url, ut, 0) != URLOK)
 										DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
 									      else if (!frontcmp (ut->dir, u->dir))
 										{
 										  /* Failing that too, kill the URL.  */
 										  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+										  string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+										  inl = 1;
 										}
 									      freeurl (ut, 1);
 									    }
 									}
 								      /* If the file does not match the acceptance list, or is on the
 									 rejection list, chuck it out.  The same goes for the
 									 directory exclude- and include- lists.  */
 								      if (!inl && (opt.includes || opt.excludes))
 									{
 									  if (!accdir (u->dir, ALLABS))
 									    {
 									      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									      string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      inl = 1;
 									    }
 									}
 								      if (!inl)
 									{
 									  char *suf = NULL;
 									  /* We check for acceptance/rejection rules only for non-HTML
 									     documents.  Since we don't know whether they really are
 									     HTML, it will be deduced from (an OR-ed list):
 ) u->file is "" (meaning it is a directory)
 ) suffix exists, AND:
 									     a) it is "html", OR
 									     b) it is "htm"
 									     If the file *is* supposed to be HTML, it will *not* be
-												[svn] Applied Edward Sabol's patch.

											
										
										
											2000-03-02 08:28:31 -05:00
+								            subject to acc/rej rules, unless a finite maximum depth has
 								            been specified and the current depth is the maximum depth. */
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  if (!
 									      (!*u->file
 									       || (((suf = suffix (constr)) != NULL)
-												[svn] Applied Edward Sabol's patch.

											
										
										
											2000-03-02 08:28:31 -05:00
+								                  && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								                      && ((opt.reclevel != INFINITE_RECURSION) &&
 											  (depth != opt.reclevel))))))
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
 									      if (!acceptable (u->file))
 										{
 										  DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 											  constr, u->file));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+										  string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+										  inl = 1;
 										}
 									    }
 									  FREE_MAYBE (suf);
 									}
 								      /* Optimize the URL (which includes possible DNS lookup) only
 									 after all other possibilities have been exhausted.  */
 								      if (!inl)
 									{
 									  if (!opt.simple_check)
 									    opt_url (u);
 									  else
 									    {
 									      char *p;
 									      /* Just lowercase the hostname.  */
 									      for (p = u->host; *p; p++)
-												[svn] Commit several fixes.

											
										
										
											2000-04-12 09:23:35 -04:00
+										*p = TOLOWER (*p);
-												[svn] Committed <sxsbsv854j9.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-22 11:58:28 -05:00
+									      xfree (u->url);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      u->url = str_url (u, 0);
 									    }
-												[svn] Committed <sxsbsv854j9.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-22 11:58:28 -05:00
+									  xfree (constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  constr = xstrdup (u->url);
-												[svn] After canonicalizing the URL, check for its existence among undesirable_urls.
Published in <sxs7kyeohte.fsf@florida.arsdigita.de>.

											
										
										
											2001-06-14 17:48:00 -04:00
+									  /* After we have canonicalized the URL, check if we have it
 									     on the black list. */
 									  if (string_set_contains (undesirable_urls, constr))
 									    inl = 1;
 									  /* This line is bogus. */
 									  /*string_set_add (undesirable_urls, constr);*/
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+									  if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    if (!opt.spanhost && this_url && !same_host (this_url, constr))
 									      {
 										DEBUGP (("This is not the same hostname as the parent's.\n"));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+										string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+										inl = 1;
 									      }
 									}
 								      /* What about robots.txt?  */
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+								      if (!inl && opt.use_robots && u->scheme == SCHEME_FTP)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									{
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+									  struct robot_specs *specs = res_get_specs (u->host, u->port);
 									  if (!specs)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+									      char *rfile;
 									      if (res_retrieve_file (constr, &rfile))
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+										{
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+										  specs = res_parse_from_file (rfile);
-												[svn] Committed <sxsbsv854j9.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-22 11:58:28 -05:00
+										  xfree (rfile);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+										}
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+									      else
 										{
 										  /* If we cannot get real specs, at least produce
 										     dummy ones so that we can register them and stop
 										     trying to retrieve them.  */
 										  specs = res_parse ("", 0);
 										}
 									      res_register_specs (u->host, u->port, specs);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    }
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+									  /* Now that we have (or don't have) robots.txt specs, we can
 									     check what they say.  */
 									  if (!res_match_path (specs, u->path))
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
-												[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-17 21:17:30 -05:00
+									      DEBUGP (("Not following %s because robots.txt forbids it.\n",
 										       constr));
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									      string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      inl = 1;
 									    }
 									}
 								      filename = NULL;
 								      /* If it wasn't chucked out, do something with it.  */
 								      if (!inl)
 									{
 									  DEBUGP (("I've decided to load it -> "));
 									  /* Add it to the list of already-loaded URL-s.  */
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									  string_set_add (undesirable_urls, constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  /* Automatically followed FTPs will *not* be downloaded
 									     recursively.  */
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+									  if (u->scheme == SCHEME_FTP)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
 									      /* Don't you adore side-effects?  */
 									      opt.recursive = 0;
 									    }
 									  /* Reset its type.  */
 									  dt = 0;
 									  /* Retrieve it.  */
 									  retrieve_url (constr, &filename, &newloc,
 										       canon_this_url ? canon_this_url : this_url, &dt);
-												[svn] Clean up handling of schemes.
Published in <sxswv0n7h7s.fsf@florida.arsdigita.de>.

											
										
										
											2001-11-18 19:12:05 -05:00
+									  if (u->scheme == SCHEME_FTP)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
 									      /* Restore...  */
 									      opt.recursive = 1;
 									    }
 									  if (newloc)
 									    {
-												[svn] Committed <sxsbsv854j9.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-22 11:58:28 -05:00
+									      xfree (constr);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      constr = newloc;
 									    }
 									  /* If there was no error, and the type is text/html, parse
 									     it recursively.  */
 									  if (dt & TEXTHTML)
 									    {
 									      if (dt & RETROKF)
 										recursive_retrieve (filename, constr);
 									    }
 									  else
 									    DEBUGP (("%s is not text/html so we don't chase.\n",
 										     filename ? filename: "(null)"));
-												[svn] --delete-after wasn't implemented for files retrieved by FTP or corresponding to
files specified on the commandline.  Made --convert-links be ignored when
--delete-after is specified.  Added note about this fact to --delete-after docs
and made general improvements to them, including the clarification that
--delete-after only deletes local files.

											
										
										
											2000-10-23 23:43:47 -04:00
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  if (opt.delete_after || (filename && !acceptable (filename)))
-												[svn] --delete-after wasn't implemented for files retrieved by FTP or corresponding to
files specified on the commandline.  Made --convert-links be ignored when
--delete-after is specified.  Added note about this fact to --delete-after docs
and made general improvements to them, including the clarification that
--delete-after only deletes local files.

											
										
										
											2000-10-23 23:43:47 -04:00
+									    /* Either --delete-after was specified, or we loaded this otherwise
 									       rejected (e.g. by -R) HTML file just so we could harvest its
 									       hyperlinks -- in either case, delete the local file. */
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
-												[svn] --delete-after wasn't implemented for files retrieved by FTP or corresponding to
files specified on the commandline.  Made --convert-links be ignored when
--delete-after is specified.  Added note about this fact to --delete-after docs
and made general improvements to them, including the clarification that
--delete-after only deletes local files.

											
										
										
											2000-10-23 23:43:47 -04:00
+									      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 										       opt.delete_after ? "--delete-after" :
 										       "recursive rejection criteria"));
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      logprintf (LOG_VERBOSE,
 											 (opt.delete_after ? _("Removing %s.\n")
 											  : _("Removing %s since it should be rejected.\n")),
 											 filename);
 									      if (unlink (filename))
 										logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 									      dt &= ~RETROKF;
 									    }
-												[svn] --delete-after wasn't implemented for files retrieved by FTP or corresponding to
files specified on the commandline.  Made --convert-links be ignored when
--delete-after is specified.  Added note about this fact to --delete-after docs
and made general improvements to them, including the clarification that
--delete-after only deletes local files.

											
										
										
											2000-10-23 23:43:47 -04:00
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  /* If everything was OK, and links are to be converted, let's
 									     store the local filename.  */
 									  if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
 									    {
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									      cur_url->convert = CO_CONVERT_TO_RELATIVE;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									      cur_url->local_name = xstrdup (filename);
 									    }
 									}
-												[svn] My patch "persistent connection tweaks".
Published in <sxshf531qhj.fsf@florida.arsdigita.de>.

(Applied with the addition of correct calculation for the
length of the request.)

											
										
										
											2000-11-19 18:42:13 -05:00
+								      else
 									DEBUGP (("%s already in list, so we don't load.\n", constr));
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								      /* Free filename and constr.  */
 								      FREE_MAYBE (filename);
 								      FREE_MAYBE (constr);
 								      freeurl (u, 1);
 								      /* Increment the pbuf for the appropriate size.  */
 								    }
-												[svn] --delete-after wasn't implemented for files retrieved by FTP or corresponding to
files specified on the commandline.  Made --convert-links be ignored when
--delete-after is specified.  Added note about this fact to --delete-after docs
and made general improvements to them, including the clarification that
--delete-after only deletes local files.

											
										
										
											2000-10-23 23:43:47 -04:00
+								  if (opt.convert_links && !opt.delete_after)
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								    /* This is merely the first pass: the links that have been
 								       successfully downloaded are converted.  In the second pass,
 								       convert_all_links() will also convert those links that have NOT
 								       been downloaded to their canonical form.  */
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    convert_links (file, url_list);
 								  /* Free the linked list of URL-s.  */
 								  free_urlpos (url_list);
 								  /* Free the canonical this_url.  */
 								  FREE_MAYBE (canon_this_url);
 								  /* Decrement the recursion depth.  */
 								  --depth;
-												[svn] Gracefully handle opt.downloaded overflowing.
Published in <sxsd7gfnv17.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-01 18:17:31 -05:00
+								  if (downloaded_exceeds_quota ())
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    return QUOTEXC;
 								  else
 								    return RETROK;
 								}
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								void
 								register_download (const char *url, const char *file)
 								{
 								  if (!opt.convert_links)
 								    return;
 								  if (!dl_file_url_map)
 								    dl_file_url_map = make_string_hash_table (0);
 								  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 								  if (!dl_url_file_map)
 								    dl_url_file_map = make_string_hash_table (0);
 								  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 								}
 								void
 								register_html (const char *url, const char *file)
 								{
 								  if (!opt.convert_links)
 								    return;
 								  downloaded_html_files = slist_prepend (downloaded_html_files, file);
 								}
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+								/* convert_links() is called from recursive_retrieve() after we're
 								   done with an HTML file.  This call to convert_links is not complete
 								   because it converts only the downloaded files, and Wget cannot know
 								   which files will be downloaded afterwards.  So, if we have file
 								   fileone.html with:
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+								   <a href="/c/something.gif">
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
 								   and /c/something.gif was not downloaded because it exceeded the
 								   recursion depth, the reference will *not* be changed.
 								   However, later we can encounter /c/something.gif from an "upper"
 								   level HTML (let's call it filetwo.html), and it gets downloaded.
 								   But now we have a problem because /c/something.gif will be
 								   correctly transformed in filetwo.html, but not in fileone.html,
 								   since Wget could not have known that /c/something.gif will be
 								   downloaded in the future.
 								   This is why Wget must, after the whole retrieval, call
 								   convert_all_links to go once more through the entire list of
-												[svn] * *.{gmo,po,pot}: Regenerated after modifying wget --help output.

* ftp.c (ftp_retrieve_list): Use new INFINITE_RECURSION #define.

* html.c: htmlfindurl() now takes final `dash_p_leaf_HTML' parameter.
Wrapped some > 80-column lines.  When -p is specified and we're at a
leaf node, do not traverse <A>, <AREA>, or <LINK> tags other than
<LINK REL="stylesheet">.

* html.h (htmlfindurl): Now takes final `dash_p_leaf_HTML' parameter.

* init.c: Added new -p / --page-requisites / page_requisites option.

* main.c (print_help): Clarified that -l inf and -l 0 both allow
infinite recursion.  Changed the unhelpful --mirrior description
to simply give the options it's equivalent to.  Added new -p option.
(main): Added some comments; handle new -p / --page-requisites.

* options.h (struct options): Added new page_requisites field.

* recur.c: Changed "URL-s" to "URLs" and "HTML-s" to "HTMLs".
Calculate and pass down new `dash_p_leaf_HTML' parameter to
get_urls_html().  Use new INFINITE_RECURSION #define.

* retr.c: Changed "URL-s" to "URLs".  get_urls_html() now takes
final `dash_p_leaf_HTML' parameter.

* url.c: get_urls_html() and htmlfindurl() now take final
`dash_p_leaf_HTML' parameter.

* url.h (get_urls_html): Now takes final `dash_p_leaf_HTML' parameter.

* wget.h: Added some comments and new INFINITE_RECURSION #define.

* wget.texi (Recursive Retrieval Options): Documented new -p option.

											
										
										
											2000-08-30 07:26:21 -04:00
+								   retrieved HTMLs, and re-convert them.
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								   All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								   in urls_downloaded.  From these two lists information is
 								   extracted.  */
 								void
 								convert_all_links (void)
 								{
 								  slist *html;
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								  /* Destructively reverse downloaded_html_files to get it in the right order.
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+								     recursive_retrieve() used slist_prepend() consistently.  */
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								  downloaded_html_files = slist_nreverse (downloaded_html_files);
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
-												[svn] Record downloaded files and downloaded HTML files in all cases.
Published under the subject "Link conversion fix" in
<sxsn1a2n2zd.fsf@florida.arsdigita.de>.

											
										
										
											2001-03-30 21:05:54 -05:00
+								  for (html = downloaded_html_files; html; html = html->next)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								    {
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+								      urlpos *urls, *cur_url;
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								      char *url;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								      DEBUGP (("Rescanning %s\n", html->string));
 								      /* Determine the URL of the HTML file.  get_urls_html will need
 									 it.  */
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+								      url = hash_table_get (dl_file_url_map, html->string);
 								      if (url)
 									DEBUGP (("It should correspond to %s.\n", url));
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+								      else
 									DEBUGP (("I cannot find the corresponding URL.\n"));
 								      /* Parse the HTML file...  */
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+								      urls = get_urls_html (html->string, url, FALSE, NULL);
 								      /* We don't respect meta_disallow_follow here because, even if
 								         the file is not followed, we might still want to convert the
 								         links that have been followed from other files.  */
 								      for (cur_url = urls; cur_url; cur_url = cur_url->next)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									{
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									  char *local_name;
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  /* The URL must be in canonical form to be compared.  */
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									  struct urlinfo *u = newurl ();
 									  uerr_t res = parseurl (cur_url->url, u, 0);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  if (res != URLOK)
 									    {
 									      freeurl (u, 1);
 									      continue;
 									    }
 									  /* We decide the direction of conversion according to whether
 									     a URL was downloaded.  Downloaded URLs will be converted
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									     ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 									  local_name = hash_table_get (dl_url_file_map, u->url);
 									  if (local_name)
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									    DEBUGP (("%s marked for conversion, local %s\n",
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+										     u->url, local_name));
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									  /* Decide on the conversion direction.  */
-												[svn] A bunch of new features:

- use mmap() to read whole files in core instead of allocating memory
  and read'ing it.

- use a new, more general, HTML parser (html-parse.c) and interface to
  it from Wget (html-url.c).

- respect <meta name=robots content=nofollow> (easy with the new HTML
  parser).

- use hash tables instead of linked lists in places where the lists
  were used to facilitate mappings.

- rewrite the code in host.c to be more readable and faster (hash
  tables instead of home-grown lists.)

- make convert_links properly convert partial URLs to complete ones
  for those URLs that have *not* been downloaded.

- use HTTP persistent connections where available.  very
  simple-minded, caches the last connection to the server.

Published in <sxshf533d5r.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-19 15:50:10 -05:00
+									  if (local_name)
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    {
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									      /* We've downloaded this URL.  Convert it to relative
 								                 form.  We do this even if the URL already is in
 								                 relative form, because our directory structure may
 								                 not be identical to that on the server (think `-nd',
 								                 `--cut-dirs', etc.)  */
 									      cur_url->convert = CO_CONVERT_TO_RELATIVE;
 									      cur_url->local_name = xstrdup (local_name);
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    }
 									  else
 									    {
-												[svn] Committed a bunch of different tweaks of mine.
Published in <sxsr9463wrx.fsf@florida.arsdigita.de>.

											
										
										
											2000-11-20 21:06:36 -05:00
+									      /* We haven't downloaded this URL.  If it's not already
 								                 complete (including a full host name), convert it to
 								                 that form, so it can be reached while browsing this
 								                 HTML locally.  */
 									      if (!cur_url->link_complete_p)
 										cur_url->convert = CO_CONVERT_TO_COMPLETE;
 									      cur_url->local_name = NULL;
-												[svn] Initial revision

											
										
										
											1999-12-02 02:42:23 -05:00
+									    }
 									  freeurl (u, 1);
 									}
 								      /* Convert the links in the file.  */
 								      convert_links (html->string, urls);
 								      /* Free the data.  */
 								      free_urlpos (urls);
 								    }
 								}