mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Use new function to test filename for common html suffixes.
Submitted by Ian Abbott in <3CB72D29.4898.1F34872@localhost> with minor changes to formatting and comments.
This commit is contained in:
parent
2a72eef0af
commit
cfd7b9a951
@ -1,3 +1,16 @@
|
|||||||
|
2002-04-12 Ian Abbott <abbotti@mev.co.uk>
|
||||||
|
|
||||||
|
* utils.c (has_html_suffix_p): New function to test filename for
|
||||||
|
common html extensions.
|
||||||
|
|
||||||
|
* utils.h: Declare it.
|
||||||
|
|
||||||
|
* http.c (http_loop): Use it instead of previous test.
|
||||||
|
|
||||||
|
* retr.c (retrieve_url): Ditto.
|
||||||
|
|
||||||
|
* recur.c (download_child_p): Ditto.
|
||||||
|
|
||||||
2002-04-12 Hrvoje Niksic <hniksic@arsdigita.com>
|
2002-04-12 Hrvoje Niksic <hniksic@arsdigita.com>
|
||||||
|
|
||||||
* config.h.in: Define _VA_LIST on Solaris to prevent stdio.h from
|
* config.h.in: Define _VA_LIST on Solaris to prevent stdio.h from
|
||||||
|
@ -1405,7 +1405,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
|
|||||||
int use_ts, got_head = 0; /* time-stamping info */
|
int use_ts, got_head = 0; /* time-stamping info */
|
||||||
char *filename_plus_orig_suffix;
|
char *filename_plus_orig_suffix;
|
||||||
char *local_filename = NULL;
|
char *local_filename = NULL;
|
||||||
char *tms, *suf, *locf, *tmrate;
|
char *tms, *locf, *tmrate;
|
||||||
uerr_t err;
|
uerr_t err;
|
||||||
time_t tml = -1, tmr = -1; /* local and remote time-stamps */
|
time_t tml = -1, tmr = -1; /* local and remote time-stamps */
|
||||||
long local_size = 0; /* the size of the local file */
|
long local_size = 0; /* the size of the local file */
|
||||||
@ -1465,9 +1465,8 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file);
|
|||||||
*dt |= RETROKF;
|
*dt |= RETROKF;
|
||||||
|
|
||||||
/* #### Bogusness alert. */
|
/* #### Bogusness alert. */
|
||||||
/* If its suffix is "html" or "htm", assume text/html. */
|
/* If its suffix is "html" or "htm" or similar, assume text/html. */
|
||||||
if (((suf = suffix (*hstat.local_file)) != NULL)
|
if (has_html_suffix_p (*hstat.local_file))
|
||||||
&& (!strcmp (suf, "html") || !strcmp (suf, "htm")))
|
|
||||||
*dt |= TEXTHTML;
|
*dt |= TEXTHTML;
|
||||||
|
|
||||||
FREE_MAYBE (dummy);
|
FREE_MAYBE (dummy);
|
||||||
|
@ -510,7 +510,6 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
|
|
||||||
/* 6. */
|
/* 6. */
|
||||||
{
|
{
|
||||||
char *suf;
|
|
||||||
/* Check for acceptance/rejection rules. We ignore these rules
|
/* Check for acceptance/rejection rules. We ignore these rules
|
||||||
for HTML documents because they might lead to other files which
|
for HTML documents because they might lead to other files which
|
||||||
need to be downloaded. Of course, we don't know which
|
need to be downloaded. Of course, we don't know which
|
||||||
@ -521,14 +520,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
* u->file is not "" (i.e. it is not a directory)
|
* u->file is not "" (i.e. it is not a directory)
|
||||||
and either:
|
and either:
|
||||||
+ there is no file suffix,
|
+ there is no file suffix,
|
||||||
+ or there is a suffix, but it is not "html" or "htm",
|
+ or there is a suffix, but it is not "html" or "htm" or similar,
|
||||||
+ both:
|
+ both:
|
||||||
- recursion is not infinite,
|
- recursion is not infinite,
|
||||||
- and we are at its very end. */
|
- and we are at its very end. */
|
||||||
|
|
||||||
if (u->file[0] != '\0'
|
if (u->file[0] != '\0'
|
||||||
&& ((suf = suffix (url)) == NULL
|
&& (!has_html_suffix_p (url)
|
||||||
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
|
|
||||||
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
|
|| (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
|
||||||
{
|
{
|
||||||
if (!acceptable (u->file))
|
if (!acceptable (u->file))
|
||||||
|
@ -384,12 +384,11 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
|||||||
|
|
||||||
/* There is a possibility of having HTTP being redirected to
|
/* There is a possibility of having HTTP being redirected to
|
||||||
FTP. In these cases we must decide whether the text is HTML
|
FTP. In these cases we must decide whether the text is HTML
|
||||||
according to the suffix. The HTML suffixes are `.html' and
|
according to the suffix. The HTML suffixes are `.html',
|
||||||
`.htm', case-insensitive. */
|
`.htm' and a few others, case-insensitive. */
|
||||||
if (redirection_count && local_file && u->scheme == SCHEME_FTP)
|
if (redirection_count && local_file && u->scheme == SCHEME_FTP)
|
||||||
{
|
{
|
||||||
char *suf = suffix (local_file);
|
if (has_html_suffix_p (local_file))
|
||||||
if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
|
|
||||||
*dt |= TEXTHTML;
|
*dt |= TEXTHTML;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
25
src/utils.c
25
src/utils.c
@ -792,6 +792,31 @@ suffix (const char *str)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return non-zero if FNAME ends with a typical HTML suffix. The
|
||||||
|
following (case-insensitive) suffixes are presumed to indicate HTML files:
|
||||||
|
|
||||||
|
html
|
||||||
|
htm
|
||||||
|
?html (`?' matches any single character)
|
||||||
|
|
||||||
|
#### CAVEAT. This is not necessarily a good indication that FNAME
|
||||||
|
refers to a file that contains HTML! */
|
||||||
|
int
|
||||||
|
has_html_suffix_p (const char *fname)
|
||||||
|
{
|
||||||
|
char *suf;
|
||||||
|
|
||||||
|
if ((suf = suffix (fname)) == NULL)
|
||||||
|
return 0;
|
||||||
|
if (!strcasecmp (suf, "html"))
|
||||||
|
return 1;
|
||||||
|
if (!strcasecmp (suf, "htm"))
|
||||||
|
return 1;
|
||||||
|
if (suf[0] && !strcasecmp (suf + 1, "html"))
|
||||||
|
return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/* Read a line from FP and return the pointer to freshly allocated
|
/* Read a line from FP and return the pointer to freshly allocated
|
||||||
storage. The storage space is obtained through malloc() and
|
storage. The storage space is obtained through malloc() and
|
||||||
should be freed with free() when it is no longer needed.
|
should be freed with free() when it is no longer needed.
|
||||||
|
@ -70,6 +70,8 @@ int accdir PARAMS ((const char *s, enum accd));
|
|||||||
char *suffix PARAMS ((const char *s));
|
char *suffix PARAMS ((const char *s));
|
||||||
int match_tail PARAMS ((const char *, const char *));
|
int match_tail PARAMS ((const char *, const char *));
|
||||||
|
|
||||||
|
int has_html_suffix_p PARAMS ((const char *));
|
||||||
|
|
||||||
char *read_whole_line PARAMS ((FILE *));
|
char *read_whole_line PARAMS ((FILE *));
|
||||||
struct file_memory *read_file PARAMS ((const char *));
|
struct file_memory *read_file PARAMS ((const char *));
|
||||||
void read_file_free PARAMS ((struct file_memory *));
|
void read_file_free PARAMS ((struct file_memory *));
|
||||||
|
Loading…
Reference in New Issue
Block a user