1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Use new function to test filename for common html suffixes.

Submitted by Ian Abbott in <3CB72D29.4898.1F34872@localhost> with minor
changes to formatting and comments.
This commit is contained in:
abbotti 2002-04-12 11:53:39 -07:00
parent 2a72eef0af
commit cfd7b9a951
6 changed files with 48 additions and 12 deletions

View File

@ -1,3 +1,16 @@
2002-04-12 Ian Abbott <abbotti@mev.co.uk>
* utils.c (has_html_suffix_p): New function to test filename for
common html extensions.
* utils.h: Declare it.
* http.c (http_loop): Use it instead of previous test.
* retr.c (retrieve_url): Ditto.
* recur.c (download_child_p): Ditto.
2002-04-12 Hrvoje Niksic <hniksic@arsdigita.com> 2002-04-12 Hrvoje Niksic <hniksic@arsdigita.com>
* config.h.in: Define _VA_LIST on Solaris to prevent stdio.h from * config.h.in: Define _VA_LIST on Solaris to prevent stdio.h from

View File

@ -1405,7 +1405,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
int use_ts, got_head = 0; /* time-stamping info */ int use_ts, got_head = 0; /* time-stamping info */
char *filename_plus_orig_suffix; char *filename_plus_orig_suffix;
char *local_filename = NULL; char *local_filename = NULL;
char *tms, *suf, *locf, *tmrate; char *tms, *locf, *tmrate;
uerr_t err; uerr_t err;
time_t tml = -1, tmr = -1; /* local and remote time-stamps */ time_t tml = -1, tmr = -1; /* local and remote time-stamps */
long local_size = 0; /* the size of the local file */ long local_size = 0; /* the size of the local file */
@ -1465,9 +1465,8 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file);
*dt |= RETROKF; *dt |= RETROKF;
/* #### Bogusness alert. */ /* #### Bogusness alert. */
/* If its suffix is "html" or "htm", assume text/html. */ /* If its suffix is "html" or "htm" or similar, assume text/html. */
if (((suf = suffix (*hstat.local_file)) != NULL) if (has_html_suffix_p (*hstat.local_file))
&& (!strcmp (suf, "html") || !strcmp (suf, "htm")))
*dt |= TEXTHTML; *dt |= TEXTHTML;
FREE_MAYBE (dummy); FREE_MAYBE (dummy);

View File

@ -510,7 +510,6 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
/* 6. */ /* 6. */
{ {
char *suf;
/* Check for acceptance/rejection rules. We ignore these rules /* Check for acceptance/rejection rules. We ignore these rules
for HTML documents because they might lead to other files which for HTML documents because they might lead to other files which
need to be downloaded. Of course, we don't know which need to be downloaded. Of course, we don't know which
@ -521,14 +520,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
* u->file is not "" (i.e. it is not a directory) * u->file is not "" (i.e. it is not a directory)
and either: and either:
+ there is no file suffix, + there is no file suffix,
+ or there is a suffix, but is not "html" or "htm", + or there is a suffix, but is not "html" or "htm" or similar,
+ both: + both:
- recursion is not infinite, - recursion is not infinite,
- and we are at its very end. */ - and we are at its very end. */
if (u->file[0] != '\0' if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL && (!has_html_suffix_p (url)
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel))) || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{ {
if (!acceptable (u->file)) if (!acceptable (u->file))

View File

@ -384,12 +384,11 @@ retrieve_url (const char *origurl, char **file, char **newloc,
/* There is a possibility of having HTTP being redirected to /* There is a possibility of having HTTP being redirected to
FTP. In these cases we must decide whether the text is HTML FTP. In these cases we must decide whether the text is HTML
according to the suffix. The HTML suffixes are `.html' and according to the suffix. The HTML suffixes are `.html',
`.htm', case-insensitive. */ `.htm' and a few others, case-insensitive. */
if (redirection_count && local_file && u->scheme == SCHEME_FTP) if (redirection_count && local_file && u->scheme == SCHEME_FTP)
{ {
char *suf = suffix (local_file); if (has_html_suffix_p (local_file))
if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
*dt |= TEXTHTML; *dt |= TEXTHTML;
} }
} }

View File

@ -792,6 +792,31 @@ suffix (const char *str)
return NULL; return NULL;
} }
/* Guess from its name whether FNAME is an HTML file.

   The suffix of FNAME, as reported by suffix(), is compared without
   regard to case against these patterns:

     html
     htm
     ?html   (`?' stands for exactly one arbitrary character)

   Returns 1 when one of the patterns matches and 0 otherwise,
   including the case where suffix() reports no suffix at all (NULL).

   #### CAVEAT.  A matching suffix is only a heuristic; it does not
   guarantee that the file FNAME names really contains HTML.  */
int
has_html_suffix_p (const char *fname)
{
  char *ext = suffix (fname);

  /* No suffix means nothing to match against.  */
  if (ext == NULL)
    return 0;

  /* The two exact spellings.  */
  if (strcasecmp (ext, "html") == 0 || strcasecmp (ext, "htm") == 0)
    return 1;

  /* The `?html' pattern: skip one leading character, but only if
     there is one, so that we never step past the terminating NUL.  */
  return ext[0] != '\0' && strcasecmp (ext + 1, "html") == 0;
}
/* Read a line from FP and return the pointer to freshly allocated /* Read a line from FP and return the pointer to freshly allocated
storage. The storage space is obtained through malloc() and storage. The storage space is obtained through malloc() and
should be freed with free() when it is no longer needed. should be freed with free() when it is no longer needed.

View File

@ -70,6 +70,8 @@ int accdir PARAMS ((const char *s, enum accd));
char *suffix PARAMS ((const char *s)); char *suffix PARAMS ((const char *s));
int match_tail PARAMS ((const char *, const char *)); int match_tail PARAMS ((const char *, const char *));
int has_html_suffix_p PARAMS ((const char *));
char *read_whole_line PARAMS ((FILE *)); char *read_whole_line PARAMS ((FILE *));
struct file_memory *read_file PARAMS ((const char *)); struct file_memory *read_file PARAMS ((const char *));
void read_file_free PARAMS ((struct file_memory *)); void read_file_free PARAMS ((struct file_memory *));