
[svn] Ignore -np when in -p mode.

Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
hniksic 2001-11-30 13:17:53 -08:00
parent 5deec11234
commit a4db28e20f
6 changed files with 57 additions and 59 deletions
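In short, the patch drops the old dash_p_leaf_HTML pruning from the HTML parser and instead tags each collected link with link_inline_p when the page needs it to render; recur.c then uses that flag both to restrict -p leaf pages to their requisites and to exempt requisites from -np. Below is a small compilable model of the resulting policy, not code from the patch: descend_p and the struct link fields are made-up names standing in for descend_url_p and urlpos.

/* Illustrative model only; simplified names, not code from the patch.  */
struct link {
  int inline_p;         /* models urlpos.link_inline_p              */
  int same_site;        /* same scheme/host/port as the start URL   */
  int under_start_dir;  /* inside the start URL's directory subtree */
};

/* Decide whether recursion should follow LNK.  AT_P_LEAF models
   dash_p_leaf_HTML, i.e. an HTML page at the -l depth limit that was
   fetched only because of -p.  */
static int
descend_p (const struct link *lnk, int no_parent, int page_requisites,
           int at_p_leaf)
{
  if (at_p_leaf && !lnk->inline_p)
    return 0;                   /* leaf page: follow requisites only */

  if (no_parent && lnk->same_site && !lnk->under_start_dir
      && !(page_requisites && lnk->inline_p))
    return 0;                   /* -np blocks it, unless it is a -p requisite */

  return 1;
}

With -np -p, an image or stylesheet above the start directory therefore passes the second check, while an ordinary <a href> link up there is still rejected.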

TODO

@@ -17,12 +17,6 @@ changes.
 * -p should probably go "_two_ more hops" on <FRAMESET> pages.
-* Only normal link-following recursion should respect -np.  Page-requisite
-  recursion should not.  When -np -p is specified, Wget should still retrieve
-  requisite images and such on the server, even if they aren't in that directory
-  or a subdirectory of it.  Likewise, -H -np -p should retrieve requisite files
-  from other hosts.
 * Add a --range parameter allowing you to explicitly specify a range of bytes to
   get from a file over HTTP (FTP only supports ranges ending at the end of the
   file, though forcibly disconnecting from the server at the desired endpoint

src/ChangeLog

@@ -1,3 +1,16 @@
+2001-11-30  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* recur.c (retrieve_tree): Skip the non-inline entries when
+	enqueuing the children of a leaf HTML node in -p mode.
+	(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
+	"inline".
+
+	* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
+	(collect_tags_mapper): When an entry is "inline", mark it as such.
+
+	* recur.c (descend_url_p): Fix test when checking for
+	acceptance/rejection rules.
+
 2001-10-31  Daniel BODEA  <dali@dali-designs.com>
 
 	* netrc.c (search_netrc): When slack_default is 0, still look for

src/html-url.c

@@ -287,9 +287,6 @@ struct collect_urls_closure {
   struct urlpos *head, *tail;	/* List of URLs */
   const char *parent_base;	/* Base of the current document. */
   const char *document_file;	/* File name of this document. */
-  int dash_p_leaf_HTML;		/* Whether -p is specified, and this
-				   document is the "leaf" node of the
-				   HTML tree. */
   int nofollow;			/* whether NOFOLLOW was specified in a
 				   <meta name=robots> tag. */
 };
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 	for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
 	     i++)
 	  {
-	    char *attr_value;
-	    if (closure->dash_p_leaf_HTML
-		&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
-	      /* If we're at a -p leaf node, we don't want to retrieve
-		 links to references we know are external to this document,
-		 such as <a href=...>.  */
-	      continue;
-
-	    if (!strcasecmp (tag->attrs[id].name,
-			     url_tag_attr_map[i].attr_name))
+	    if (0 == strcasecmp (tag->attrs[id].name,
+				 url_tag_attr_map[i].attr_name))
 	      {
-		attr_value = tag->attrs[id].value;
+		char *attr_value = tag->attrs[id].value;
 		if (attr_value)
-		  handle_link (closure, attr_value, tag, id);
+		  {
+		    struct urlpos *entry;
+		    entry = handle_link (closure, attr_value, tag, id);
+		    if (entry != NULL
+			&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
+		      entry->link_inline_p = 1;
+		  }
 	      }
 	  }
 	}
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
     case TAG_LINK:
       {
 	int id;
-	char *rel = find_attr (tag, "rel", NULL);
 	char *href = find_attr (tag, "href", &id);
+
+	/* All <link href="..."> link references are external,
+	   except for <link rel="stylesheet" href="...">.  */
 	if (href)
 	  {
-	    /* In the normal case, all <link href=...> tags are
-	       fair game.
-
-	       In the special case of when -p is active, however,
-	       and we're at a leaf node (relative to the -l
-	       max. depth) in the HTML document tree, the only
-	       <LINK> tag we'll follow is a <LINK REL=
-	       "stylesheet">, as it'll be necessary for displaying
-	       this document properly.  We won't follow other
-	       <LINK> tags, like <LINK REL="home">, for instance,
-	       as they refer to external documents.  */
-	    if (!closure->dash_p_leaf_HTML
-		|| (rel && !strcasecmp (rel, "stylesheet")))
-	      handle_link (closure, href, tag, id);
+	    struct urlpos *entry;
+	    entry = handle_link (closure, href, tag, id);
+	    if (entry != NULL)
+	      {
+		char *rel = find_attr (tag, "rel", NULL);
+		if (rel && 0 == strcasecmp (rel, "stylesheet"))
+		  entry->link_inline_p = 1;
+	      }
 	  }
       }
       break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 
 /* Analyze HTML tags FILE and construct a list of URLs referenced from
    it.  It merges relative links in FILE with URL.  It is aware of
-   <base href=...> and does the right thing.
-
-   If dash_p_leaf_HTML is non-zero, only the elements needed to render
-   FILE ("non-external" links) will be returned.  */
+   <base href=...> and does the right thing.  */
 struct urlpos *
-get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
-	       int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
 {
   struct file_memory *fm;
   struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
   closure.base = NULL;
   closure.parent_base = url ? url : opt.base_href;
   closure.document_file = file;
-  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
   closure.nofollow = 0;
 
   if (!interesting_tags)
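With dash_p_leaf_HTML gone, callers just pass the file and URL and filter on link_inline_p afterwards. A hedged sketch of such a caller follows; it assumes the urlpos list is chained through a next pointer as the head/tail fields above suggest, and count_requisites itself is invented for illustration. Only get_urls_html(), free_urlpos() and the urlpos flags come from this change.

/* Hypothetical caller of the new interface.  */
static int
count_requisites (const char *file, const char *url)
{
  int meta_disallow_follow = 0;
  int n = 0;
  struct urlpos *links, *l;

  links = get_urls_html (file, url, &meta_disallow_follow);
  for (l = links; l; l = l->next)
    if (l->link_inline_p && !l->ignore_when_downloading)
      ++n;                      /* images, stylesheets, and the like */
  free_urlpos (links);
  return n;
}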

src/recur.c

@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
       if (descend)
 	{
 	  int meta_disallow_follow = 0;
-	  struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
-						   &meta_disallow_follow);
+	  struct urlpos *children
+	    = get_urls_html (file, url, &meta_disallow_follow);
 
 	  if (opt.use_robots && meta_disallow_follow)
 	    {
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
 	    {
 	      if (child->ignore_when_downloading)
 		continue;
+	      if (dash_p_leaf_HTML && !child->link_inline_p)
+		continue;
 	      if (descend_url_p (child, url_parsed, depth, start_url_parsed,
 				 blacklist))
 		{
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   /* 4. Check for parent directory.
 
      If we descended to a different host or changed the scheme, ignore
-     opt.no_parent.  Also ignore it for -p leaf retrievals.  */
+     opt.no_parent.  Also ignore it for documents needed to display
+     the parent page when in -p mode.  */
   if (opt.no_parent
       && u->scheme == start_url_parsed->scheme
       && 0 == strcasecmp (u->host, start_url_parsed->host)
-      && u->port == start_url_parsed->port)
+      && u->port == start_url_parsed->port
+      && !(opt.page_requisites && upos->link_inline_p))
     {
       if (!frontcmp (start_url_parsed->dir, u->dir))
 	{
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
   if (u->file[0] != '\0'
       && ((suf = suffix (url)) == NULL
 	  || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
-	  || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+	  || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
     {
       if (!acceptable (u->file))
 	{
@@ -674,7 +678,7 @@ convert_all_links (void)
 	DEBUGP (("I cannot find the corresponding URL.\n"));
 
       /* Parse the HTML file...  */
-      urls = get_urls_html (html->string, url, FALSE, NULL);
+      urls = get_urls_html (html->string, url, NULL);
 
       /* We don't respect meta_disallow_follow here because, even if
 	 the file is not followed, we might still want to convert the
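The one-character hunk above is the "fix test" item from the ChangeLog: the old condition compared opt.reclevel == INFINITE_RECURSION, so the depth clause could only fire when recursion was unlimited and never when a finite -l limit was in effect. Restated as a predicate under that reading (the helper name and factoring are mine, not the patch's; this assumes wget's opt and INFINITE_RECURSION plus <string.h>):

/* Apply the -A/-R acceptance rules unless the file looks like HTML
   that we still intend to descend into.  */
static int
apply_accept_reject_p (const char *suf, int depth)
{
  int looks_like_html =
    suf && (0 == strcmp (suf, "html") || 0 == strcmp (suf, "htm"));
  int at_depth_limit =
    opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel;

  return !looks_like_html || at_depth_limit;
}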

src/retr.c

@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
   uerr_t status;
   struct urlpos *url_list, *cur_url;
 
-  url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+  url_list = (html ? get_urls_html (file, NULL, NULL)
 	      : get_urls_file (file));
   status = RETROK;		/* Suppose everything is OK.  */
   *count = 0;			/* Reset the URL count.  */

src/url.h

@@ -79,16 +79,17 @@ struct urlpos {
   char *local_name;		/* local file to which it was saved
 				   (used by convert_links) */
 
-  int ignore_when_downloading;	/* reserved for special links such as
-				   <base href="..."> which are used
-				   when converting links, but ignored
-				   when downloading. */
+  /* reserved for special links such as <base href="..."> which are
+     used when converting links, but ignored when downloading.  */
+  unsigned int ignore_when_downloading :1;
 
   /* Information about the original link: */
 
-  int link_relative_p;		/* was the link relative? */
-  int link_complete_p;		/* was the link complete (with the
-				   host name, etc.) */
-  int link_base_p;		/* was the link <base href=...> */
+  unsigned int link_relative_p :1; /* was the link relative? */
+  unsigned int link_complete_p :1; /* was the link complete (with the
+				      host name, etc.) */
+  unsigned int link_base_p :1;	   /* was the link <base href=...> */
+  unsigned int link_inline_p :1;   /* needed to render the page. */
 
   /* Conversion requirements: */
   enum convert_options convert;	/* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
 char *url_string PARAMS ((const struct url *, int));
 
 struct urlpos *get_urls_file PARAMS ((const char *));
-struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
+struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
 void free_urlpos PARAMS ((struct urlpos *));
 
 char *uri_merge PARAMS ((const char *, const char *));
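The url.h hunk also turns the per-link booleans into one-bit bit-fields. A standalone illustration of the likely space motivation follows; this is not wget code, and the exact sizes depend on the compiler and ABI.

#include <stdio.h>

/* Four ints versus four one-bit bit-fields, as in the old and new
   struct urlpos flag members.  */
struct flags_as_ints { int a, b, c, d; };
struct flags_as_bits { unsigned int a:1, b:1, c:1, d:1; };

int
main (void)
{
  printf ("ints: %lu bytes, bit-fields: %lu bytes\n",
          (unsigned long) sizeof (struct flags_as_ints),
          (unsigned long) sizeof (struct flags_as_bits));
  /* Typically 16 versus 4; a single page can yield many urlpos
     entries, so the smaller layout adds up.  */
  return 0;
}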