1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Ignore -np when in -p mode.

Published in <sxsg06w2c52.fsf@florida.arsdigita.de>.
This commit is contained in:
hniksic 2001-11-30 13:17:53 -08:00
parent 5deec11234
commit a4db28e20f
6 changed files with 57 additions and 59 deletions

6
TODO
View File

@@ -17,12 +17,6 @@ changes.
* -p should probably go "_two_ more hops" on <FRAMESET> pages.
* Only normal link-following recursion should respect -np. Page-requisite
recursion should not. When -np -p is specified, Wget should still retrieve
requisite images and such on the server, even if they aren't in that directory
or a subdirectory of it. Likewise, -H -np -p should retrieve requisite files
from other hosts.
* Add a --range parameter allowing you to explicitly specify a range of bytes to
get from a file over HTTP (FTP only supports ranges ending at the end of the
file, though forcibly disconnecting from the server at the desired endpoint

View File

@@ -1,3 +1,16 @@
2001-11-30 Hrvoje Niksic <hniksic@arsdigita.com>
* recur.c (retrieve_tree): Skip the non-inline entries when
enqueuing the children of a leaf HTML node in -p mode.
(descend_url_p): Ignore opt.no_parent when in -p mode and UPOS is
"inline".
* html-url.c (get_urls_html): Don't accept dash_p_leaf_HTML.
(collect_tags_mapper): When an entry is "inline", mark it as such.
* recur.c (descend_url_p): Fix test when checking for
acceptance/rejection rules.
2001-10-31 Daniel BODEA <dali@dali-designs.com>
* netrc.c (search_netrc): When slack_default is 0, still look for

View File

@@ -287,9 +287,6 @@ struct collect_urls_closure {
struct urlpos *head, *tail; /* List of URLs */
const char *parent_base; /* Base of the current document. */
const char *document_file; /* File name of this document. */
int dash_p_leaf_HTML; /* Whether -p is specified, and this
document is the "leaf" node of the
HTML tree. */
int nofollow; /* whether NOFOLLOW was specified in a
<meta name=robots> tag. */
};
@@ -413,20 +410,18 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
i++)
{
char *attr_value;
if (closure->dash_p_leaf_HTML
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
/* If we're at a -p leaf node, we don't want to retrieve
links to references we know are external to this document,
such as <a href=...>. */
continue;
if (!strcasecmp (tag->attrs[id].name,
if (0 == strcasecmp (tag->attrs[id].name,
url_tag_attr_map[i].attr_name))
{
attr_value = tag->attrs[id].value;
char *attr_value = tag->attrs[id].value;
if (attr_value)
handle_link (closure, attr_value, tag, id);
{
struct urlpos *entry;
entry = handle_link (closure, attr_value, tag, id);
if (entry != NULL
&& !(url_tag_attr_map[i].flags & AF_EXTERNAL))
entry->link_inline_p = 1;
}
}
}
}
@@ -460,24 +455,20 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
case TAG_LINK:
{
int id;
char *rel = find_attr (tag, "rel", NULL);
char *href = find_attr (tag, "href", &id);
/* All <link href="..."> link references are external,
except for <link rel="stylesheet" href="...">. */
if (href)
{
/* In the normal case, all <link href=...> tags are
fair game.
In the special case of when -p is active, however,
and we're at a leaf node (relative to the -l
max. depth) in the HTML document tree, the only
<LINK> tag we'll follow is a <LINK REL=
"stylesheet">, as it'll be necessary for displaying
this document properly. We won't follow other
<LINK> tags, like <LINK REL="home">, for instance,
as they refer to external documents. */
if (!closure->dash_p_leaf_HTML
|| (rel && !strcasecmp (rel, "stylesheet")))
handle_link (closure, href, tag, id);
struct urlpos *entry;
entry = handle_link (closure, href, tag, id);
if (entry != NULL)
{
char *rel = find_attr (tag, "rel", NULL);
if (rel && 0 == strcasecmp (rel, "stylesheet"))
entry->link_inline_p = 1;
}
}
}
break;
@@ -557,13 +548,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
/* Analyze HTML tags FILE and construct a list of URLs referenced from
it. It merges relative links in FILE with URL. It is aware of
<base href=...> and does the right thing.
If dash_p_leaf_HTML is non-zero, only the elements needed to render
FILE ("non-external" links) will be returned. */
<base href=...> and does the right thing. */
struct urlpos *
get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
int *meta_disallow_follow)
get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
{
struct file_memory *fm;
struct collect_urls_closure closure;
@@ -582,7 +569,6 @@ get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
closure.base = NULL;
closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;
if (!interesting_tags)

View File

@@ -279,8 +279,8 @@ retrieve_tree (const char *start_url)
if (descend)
{
int meta_disallow_follow = 0;
struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
&meta_disallow_follow);
struct urlpos *children
= get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
@@ -298,6 +298,8 @@ retrieve_tree (const char *start_url)
{
if (child->ignore_when_downloading)
continue;
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
@@ -435,11 +437,13 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
/* 4. Check for parent directory.
If we descended to a different host or changed the scheme, ignore
opt.no_parent. Also ignore it for -p leaf retrievals. */
opt.no_parent. Also ignore it for documents needed to display
the parent page when in -p mode. */
if (opt.no_parent
&& u->scheme == start_url_parsed->scheme
&& 0 == strcasecmp (u->host, start_url_parsed->host)
&& u->port == start_url_parsed->port)
&& u->port == start_url_parsed->port
&& !(opt.page_requisites && upos->link_inline_p))
{
if (!frontcmp (start_url_parsed->dir, u->dir))
{
@@ -482,7 +486,7 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|| (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{
if (!acceptable (u->file))
{
@@ -674,7 +678,7 @@ convert_all_links (void)
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
urls = get_urls_html (html->string, url, FALSE, NULL);
urls = get_urls_html (html->string, url, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the

View File

@@ -535,7 +535,7 @@ retrieve_from_file (const char *file, int html, int *count)
uerr_t status;
struct urlpos *url_list, *cur_url;
url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
url_list = (html ? get_urls_html (file, NULL, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */

View File

@@ -79,16 +79,17 @@ struct urlpos {
char *local_name; /* local file to which it was saved
(used by convert_links) */
int ignore_when_downloading; /* reserved for special links such as
<base href="..."> which are used
when converting links, but ignored
when downloading. */
/* reserved for special links such as <base href="..."> which are
used when converting links, but ignored when downloading. */
unsigned int ignore_when_downloading :1;
/* Information about the original link: */
int link_relative_p; /* was the link relative? */
int link_complete_p; /* was the link complete (with the
unsigned int link_relative_p :1; /* was the link relative? */
unsigned int link_complete_p :1; /* was the link complete (with the
host name, etc.) */
int link_base_p; /* was the link <base href=...> */
unsigned int link_base_p :1; /* was the link <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page. */
/* Conversion requirements: */
enum convert_options convert; /* is conversion required? */
@@ -134,7 +135,7 @@ int url_skip_uname PARAMS ((const char *));
char *url_string PARAMS ((const struct url *, int));
struct urlpos *get_urls_file PARAMS ((const char *));
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
struct urlpos *get_urls_html PARAMS ((const char *, const char *, int *));
void free_urlpos PARAMS ((struct urlpos *));
char *uri_merge PARAMS ((const char *, const char *));