diff --git a/configure.ac b/configure.ac
index a49de3cd..cf201aea 100644
--- a/configure.ac
+++ b/configure.ac
@@ -113,6 +113,8 @@ md5_EARLY
AC_PROG_RANLIB
+AC_PROG_LEX
+
dnl Turn on optimization by default. Specifically:
dnl
dnl if the user hasn't specified CFLAGS, then
diff --git a/src/Makefile.am b/src/Makefile.am
index f598d908..2403f671 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -35,13 +35,15 @@ DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(local
LIBS = @LIBS@ @LIBSSL@ @LIBGNUTLS@ @LIBINTL@
bin_PROGRAMS = wget
-wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c ftp-basic.c \
+wget_SOURCES = cmpt.c connect.c convert.c cookies.c \
+ css.lex css-url.c \
+ ftp.c ftp-basic.c \
ftp-ls.c hash.c host.c html-parse.c html-url.c http.c \
init.c log.c main.c netrc.c progress.c ptimer.c recur.c \
res.c retr.c snprintf.c spider.c url.c \
utils.c xmalloc.c \
- connect.h convert.h cookies.h \
- ftp.h gen-md5.h hash.h host.h html-parse.h \
+ css-url.h connect.h convert.h cookies.h \
+ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
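Note: automake's built-in lex support only covers the .l/.ll suffixes, so
listing css.lex in wget_SOURCES may not trigger the lex rule by itself. A
minimal hand-written rule, assuming flex rather than plain lex (a sketch,
not part of this patch), would be:

    css.c: $(srcdir)/css.lex
	    $(LEX) -o$@ $(srcdir)/css.lex

with css.c then listed in wget_SOURCES; alternatively, renaming the file
to css.l lets automake's default rule apply.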
diff --git a/src/convert.c b/src/convert.c
index 2811bff7..4f90bb3b 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -45,50 +45,37 @@ as that of the covered work. */
#include "hash.h"
#include "ptimer.h"
#include "res.h"
+#include "html-url.h"
+#include "css-url.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
-/* Set of HTML files downloaded in this Wget run, used for link
+/* Set of HTML/CSS files downloaded in this Wget run, used for link
conversion after Wget is done. */
struct hash_table *downloaded_html_set;
+struct hash_table *downloaded_css_set;
static void convert_links (const char *, struct urlpos *);
-/* This function is called when the retrieval is done to convert the
- links that have been downloaded. It has to be called at the end of
- the retrieval, because only then does Wget know conclusively which
- URLs have been downloaded, and which not, so it can tell which
- direction to convert to.
-
- The "direction" means that the URLs to the files that have been
- downloaded get converted to the relative URL which will point to
- that file. And the other URLs get converted to the remote URL on
- the server.
-
- All the downloaded HTMLs are kept in downloaded_html_files, and
- downloaded URLs in urls_downloaded. All the information is
- extracted from these two lists. */
void
-convert_all_links (void)
+convert_links_in_hashtable (struct hash_table *downloaded_set,
+ int is_css,
+ int *file_count)
{
int i;
- double secs;
- int file_count = 0;
-
- struct ptimer *timer = ptimer_new ();
int cnt;
char **file_array;
cnt = 0;
- if (downloaded_html_set)
- cnt = hash_table_count (downloaded_html_set);
+ if (downloaded_set)
+ cnt = hash_table_count (downloaded_set);
if (cnt == 0)
- goto cleanup;
+ return;
file_array = alloca_array (char *, cnt);
- string_set_to_array (downloaded_html_set, file_array);
+ string_set_to_array (downloaded_set, file_array);
for (i = 0; i < cnt; i++)
{
@@ -96,7 +83,7 @@ convert_all_links (void)
char *url;
char *file = file_array[i];
- /* Determine the URL of the HTML file. get_urls_html will need
+ /* Determine the URL of the file. get_urls_{html,css} will need
it. */
url = hash_table_get (dl_file_url_map, file);
if (!url)
@@ -107,8 +94,9 @@ convert_all_links (void)
DEBUGP (("Scanning %s (from %s)\n", file, url));
- /* Parse the HTML file... */
- urls = get_urls_html (file, url, NULL);
+ /* Parse the file... */
+ urls = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
@@ -160,11 +148,38 @@ convert_all_links (void)
/* Convert the links in the file. */
convert_links (file, urls);
- ++file_count;
+ ++*file_count;
/* Free the data. */
free_urlpos (urls);
}
+}
+
+/* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
+
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
+
+ All the downloaded HTMLs are kept in downloaded_html_files, and
+ downloaded URLs in urls_downloaded. All the information is
+ extracted from these two lists. */
+
+void
+convert_all_links (void)
+{
+ double secs;
+ int file_count = 0;
+
+ struct ptimer *timer = ptimer_new ();
+
+ convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
+ convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
secs = ptimer_measure (timer);
logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
@@ -174,13 +189,14 @@ cleanup:
}
static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_plain (const char*, int, FILE*, const char *);
static const char *replace_attr (const char *, int, FILE *, const char *);
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
const char *, int);
static char *local_quote_string (const char *);
static char *construct_relative (const char *, const char *);
-/* Change the links in one HTML file. LINKS is a list of links in the
+/* Change the links in one file. LINKS is a list of links in the
document, along with their positions and the desired direction of
the conversion. */
static void
@@ -277,7 +293,9 @@ convert_links (const char *file, struct urlpos *links)
char *newname = construct_relative (file, link->local_name);
char *quoted_newname = local_quote_string (newname);
- if (!link->link_refresh_p)
+ if (link->link_css_p)
+ p = replace_plain (p, link->size, fp, quoted_newname);
+ else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
else
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
@@ -296,7 +314,9 @@ convert_links (const char *file, struct urlpos *links)
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
- if (!link->link_refresh_p)
+ if (link->link_css_p)
+ p = replace_plain (p, link->size, fp, quoted_newlink);
+ else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newlink);
else
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
@@ -406,6 +426,7 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
size_t filename_len = strlen (file);
char* filename_plus_orig_suffix;
+ /* TODO: hack this to work with css files */
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
/* Just write "orig" over "html". We need to do it this way
@@ -465,6 +486,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
static bool find_fragment (const char *, int, const char **, const char **);
+/* Replace the SIZE bytes at P with NEW_TEXT, written out verbatim
+ with no quoting or escaping (the CSS case). */
+static const char *
+replace_plain (const char *p, int size, FILE *fp, const char *new_text)
+{
+ fputs (new_text, fp);
+ p += size;
+ return p;
+}
+
/* Replace an attribute's original text with NEW_TEXT. */
static const char *
@@ -832,6 +862,16 @@ register_html (const char *url, const char *file)
string_set_add (downloaded_html_set, file);
}
+/* Register that FILE is a CSS file that has been downloaded. */
+
+void
+register_css (const char *url, const char *file)
+{
+ if (!downloaded_css_set)
+ downloaded_css_set = make_string_hash_table (0);
+ string_set_add (downloaded_css_set, file);
+}
+
static void downloaded_files_free (void);
/* Cleanup the data structures associated with this file. */
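Note: to make the replace_plain/replace_attr split concrete, convert_links
copies the document byte-for-byte up to each recorded link position, then
substitutes the rewritten URL for the original SIZE bytes. A simplified
sketch of that loop shape (names as in the code above):

    /* Copy the untouched text before the link, then substitute. */
    const char *url_start = fm->content + link->pos;
    fwrite (p, 1, url_start - p, fp);
    p = url_start;
    p = replace_plain (p, link->size, fp, quoted_newname);  /* CSS case */

CSS has no attribute syntax around the URL, so replace_plain just writes
the new text and skips the old bytes, while replace_attr must also re-emit
the quoting conventions of an HTML attribute value.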
diff --git a/src/convert.h b/src/convert.h
index 0dd9d018..3d8b3059 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -33,6 +33,7 @@ as that of the covered work. */
struct hash_table; /* forward decl */
extern struct hash_table *dl_url_file_map;
extern struct hash_table *downloaded_html_set;
+extern struct hash_table *downloaded_css_set;
enum convert_options {
CO_NOCONVERT = 0, /* don't convert this URL */
@@ -64,7 +65,9 @@ struct urlpos {
unsigned int link_complete_p :1; /* the link was complete (had host name) */
unsigned int link_base_p :1; /* the url came from <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page */
+ unsigned int link_css_p :1; /* the url came from CSS */
unsigned int link_expect_html :1; /* expected to contain HTML */
+ unsigned int link_expect_css :1; /* expected to contain CSS */
unsigned int link_refresh_p :1; /* link was received from
<meta http-equiv=refresh content=...> */
@@ -98,6 +101,7 @@ downloaded_file_t downloaded_file (downloaded_file_t, const char *);
void register_download (const char *, const char *);
void register_redirection (const char *, const char *);
void register_html (const char *, const char *);
+void register_css (const char *, const char *);
void register_delete_file (const char *);
void convert_all_links (void);
void convert_cleanup (void);
diff --git a/src/css-tokens.h b/src/css-tokens.h
new file mode 100644
index 00000000..4feef42a
--- /dev/null
+++ b/src/css-tokens.h
@@ -0,0 +1,66 @@
+/* Declarations for css.lex
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef CSS_TOKENS_H
+#define CSS_TOKENS_H
+
+enum {
+ CSSEOF,
+ S,
+ CDO,
+ CDC,
+ INCLUDES,
+ DASHMATCH,
+ LBRACE,
+ PLUS,
+ GREATER,
+ COMMA,
+ STRING,
+ INVALID,
+ IDENT,
+ HASH,
+ IMPORT_SYM,
+ PAGE_SYM,
+ MEDIA_SYM,
+ CHARSET_SYM,
+ IMPORTANT_SYM,
+ EMS,
+ EXS,
+ LENGTH,
+ ANGLE,
+ TIME,
+ FREQ,
+ DIMENSION,
+ PERCENTAGE,
+ NUMBER,
+ URI,
+ FUNCTION
+} css_tokens;
+
+#endif /* CSS_TOKENS_H */
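Note: CSSEOF is deliberately the first enumerator, so its value is 0,
which is also what a flex-generated yylex () returns at end of input.
The scan loop in css-url.c depends on that equivalence:

    while ((token = yylex ()) != CSSEOF)  /* yylex () returns 0 at EOF */
      ...

Reordering this enum would break termination, since yylex () would keep
returning 0 without ever matching the sentinel.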
diff --git a/src/css-url.c b/src/css-url.c
new file mode 100644
index 00000000..42c8fc3e
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,273 @@
+/* Collect URLs from CSS source.
+ Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+/*
+ Note that this is not an actual CSS parser, but just a lexical
+ scanner with a tiny bit more smarts bolted on top. A full parser
+ is somewhat overkill for this job. The only things we're interested
+ in are @import rules and url() tokens, so it's easy enough to
+ grab those without truly understanding the input. The only downside
+ to this is that we might be coerced into downloading files that
+ a browser would ignore. That might merit some more investigation.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+
+/* from lex.yy.c */
+extern char *yytext;
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
+extern int yylex (void);
+
+#if 1
+const char *token_names[] = {
+ "CSSEOF",
+ "S",
+ "CDO",
+ "CDC",
+ "INCLUDES",
+ "DASHMATCH",
+ "LBRACE",
+ "PLUS",
+ "GREATER",
+ "COMMA",
+ "STRING",
+ "INVALID",
+ "IDENT",
+ "HASH",
+ "IMPORT_SYM",
+ "PAGE_SYM",
+ "MEDIA_SYM",
+ "CHARSET_SYM",
+ "IMPORTANT_SYM",
+ "EMS",
+ "EXS",
+ "LENGTH",
+ "ANGLE",
+ "TIME",
+ "FREQ",
+ "DIMENSION",
+ "PERCENTAGE",
+ "NUMBER",
+ "URI",
+ "FUNCTION"
+};
+#endif
+
+/*
+ Given a detected URI token, get only the URI specified within.
+ Also adjust the starting position and length of the string.
+
+ A URI can be specified with or without quotes, and the quotes
+ can be single or double quotes. In addition there can be
+ whitespace after the opening parenthesis and before the closing
+ parenthesis.
+*/
+char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+ char *uri;
+ /*char buf[1024];
+ strncpy(buf,at + *pos, *length);
+ buf[*length] = '\0';
+ DEBUGP (("get_uri_string: \"%s\"\n", buf));*/
+
+ if (0 != strncasecmp (at + *pos, "url(", 4))
+ return NULL;
+
+ *pos += 4;
+ *length -= 5; /* url() */
+ /* skip leading space */
+ while (isspace (at[*pos]))
+ {
+ (*pos)++;
+ (*length)--;
+ }
+ /* skip trailing space */
+ while (isspace (at[*pos + *length - 1]))
+ {
+ (*length)--;
+ }
+ /* trim off quotes */
+ if (at[*pos] == '\'' || at[*pos] == '"')
+ {
+ (*pos)++;
+ *length -= 2;
+ }
+
+ uri = xmalloc (*length + 1);
+ if (uri)
+ {
+ strncpy (uri, at + *pos, *length);
+ uri[*length] = '\0';
+ }
+
+ return uri;
+}
+
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+ int token;
+ /*char tmp[2048];*/
+ int buffer_pos = 0;
+ int pos, length;
+ char *uri;
+
+ /*
+ strncpy(tmp,ctx->text + offset, buf_length);
+ tmp[buf_length] = '\0';
+ DEBUGP (("get_urls_css: \"%s\"\n", tmp));
+ */
+
+ /* tell flex to scan from this buffer */
+ yy_scan_bytes (ctx->text + offset, buf_length);
+
+ while ((token = yylex ()) != CSSEOF)
+ {
+ /*DEBUGP (("%s ", token_names[token]));*/
+ /* @import "foo.css"
+ or @import url(foo.css)
+ */
+ if (token == IMPORT_SYM)
+ {
+ do {
+ buffer_pos += yyleng;
+ } while ((token = yylex ()) == S);
+
+ /*DEBUGP (("%s ", token_names[token]));*/
+
+ if (token == STRING || token == URI)
+ {
+ /*DEBUGP (("Got URI "));*/
+ pos = buffer_pos + offset;
+ length = yyleng;
+
+ if (token == URI)
+ {
+ uri = get_uri_string (ctx->text, &pos, &length);
+ }
+ else
+ {
+ /* cut out quote characters */
+ pos++;
+ length -= 2;
+ uri = xmalloc (length + 1);
+ strncpy (uri, yytext + 1, length);
+ uri[length] = '\0';
+ }
+
+ if (uri)
+ {
+ struct urlpos *up = append_url (uri, pos, length, ctx);
+ DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+
+ if (up)
+ {
+ up->link_inline_p = 1;
+ up->link_css_p = 1;
+ up->link_expect_css = 1;
+ }
+
+ xfree (uri);
+ }
+ }
+ }
+ /* background-image: url(foo.png)
+ note that we don't care what
+ property this is actually on.
+ */
+ else if (token == URI)
+ {
+ pos = buffer_pos + offset;
+ length = yyleng;
+ uri = get_uri_string (ctx->text, &pos, &length);
+
+ if (uri)
+ {
+ struct urlpos *up = append_url (uri, pos, length, ctx);
+ DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+ if (up)
+ {
+ up->link_inline_p = 1;
+ up->link_css_p = 1;
+ }
+
+ xfree (uri);
+ }
+ }
+ buffer_pos += yyleng;
+ }
+ DEBUGP (("\n"));
+}
+
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+ struct file_memory *fm;
+ struct map_context ctx;
+
+ /* Load the file. */
+ fm = read_file (file);
+ if (!fm)
+ {
+ logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+ return NULL;
+ }
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
+
+ ctx.text = fm->content;
+ ctx.head = ctx.tail = NULL;
+ ctx.base = NULL;
+ ctx.parent_base = url ? url : opt.base_href;
+ ctx.document_file = file;
+ ctx.nofollow = 0;
+
+ get_urls_css (&ctx, 0, fm->length);
+ read_file_free (fm);
+ return ctx.head;
+}
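Note: a usage sketch of the new entry point (file name and URL invented;
urlpos fields as declared in convert.h):

    /* List every URL referenced by a stylesheet saved on disk. */
    struct urlpos *head = get_urls_css_file ("local/style.css",
                                             "http://example.com/style.css");
    struct urlpos *up;
    for (up = head; up; up = up->next)
      printf ("%s at %d+%d%s\n", up->url->url, up->pos, up->size,
              up->link_expect_css ? " [@import]" : "");
    free_urlpos (head);

@import targets and plain url() references come back on the same list;
only the former carry link_expect_css, since only they should themselves
be fetched and re-scanned as CSS.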
diff --git a/src/css-url.h b/src/css-url.h
new file mode 100644
index 00000000..772e2fd7
--- /dev/null
+++ b/src/css-url.h
@@ -0,0 +1,36 @@
+/* Declarations for css-url.c.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef CSS_URL_H
+#define CSS_URL_H
+
+void get_urls_css (struct map_context *, int, int);
+struct urlpos *get_urls_css_file (const char *, const char *);
+
+#endif /* CSS_URL_H */
diff --git a/src/css.lex b/src/css.lex
new file mode 100644
index 00000000..8d1477a4
--- /dev/null
+++ b/src/css.lex
@@ -0,0 +1,137 @@
+%option case-insensitive
+%option noyywrap
+%option never-interactive
+
+%{
+/* Lex source for CSS tokenizing.
+ Taken from http://www.w3.org/TR/CSS21/grammar.html#q2
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#include "css-tokens.h"
+
+/* {s}+\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {unput(' '); } */
+/*replace by space*/
+%}
+
+h [0-9a-f]
+nonascii [\200-\377]
+unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])?
+escape {unicode}|\\[^\r\n\f0-9a-f]
+nmstart [_a-z]|{nonascii}|{escape}
+nmchar [_a-z0-9-]|{nonascii}|{escape}
+string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+invalid1 \"([^\n\r\f\\"]|\\{nl}|{escape})*
+invalid2 \'([^\n\r\f\\']|\\{nl}|{escape})*
+
+comment \/\*[^*]*\*+([^/*][^*]*\*+)*\/
+ident -?{nmstart}{nmchar}*
+name {nmchar}+
+num [0-9]+|[0-9]*"."[0-9]+
+string {string1}|{string2}
+invalid {invalid1}|{invalid2}
+url ([!#$%&*-~]|{nonascii}|{escape})*
+s [ \t\r\n\f]
+w ({s}|{comment})*
+nl \n|\r\n|\r|\f
+
+A a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?
+C c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?
+D d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?
+E e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?
+G g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g
+H h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h
+I i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i
+K k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k
+M m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m
+N n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n
+P p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p
+R r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r
+S s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s
+T t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t
+X x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x
+Z z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z
+
+%%
+
+{s} {return S;}
+
+\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {return S;} /* ignore comments */
+
+"<!--" {return CDO;}
+"-->" {return CDC;}
+"~=" {return INCLUDES;}
+"|=" {return DASHMATCH;}
+
+{w}"{" {return LBRACE;}
+{w}"+" {return PLUS;}
+{w}">" {return GREATER;}
+{w}"," {return COMMA;}
+
+{string} {return STRING;}
+{invalid} {return INVALID; /* unclosed string */}
+
+{ident} {return IDENT;}
+
+"#"{name} {return HASH;}
+
+"@import" {return IMPORT_SYM;}
+"@page" {return PAGE_SYM;}
+"@media" {return MEDIA_SYM;}
+"@charset " {return CHARSET_SYM;}
+
+"!"{w}"important" {return IMPORTANT_SYM;}
+
+{num}{E}{M} {return EMS;}
+{num}{E}{X} {return EXS;}
+{num}{P}{X} {return LENGTH;}
+{num}{C}{M} {return LENGTH;}
+{num}{M}{M} {return LENGTH;}
+{num}{I}{N} {return LENGTH;}
+{num}{P}{T} {return LENGTH;}
+{num}{P}{C} {return LENGTH;}
+{num}{D}{E}{G} {return ANGLE;}
+{num}{R}{A}{D} {return ANGLE;}
+{num}{G}{R}{A}{D} {return ANGLE;}
+{num}{M}{S} {return TIME;}
+{num}{S} {return TIME;}
+{num}{H}{Z} {return FREQ;}
+{num}{K}{H}{Z} {return FREQ;}
+{num}{ident} {return DIMENSION;}
+
+{num}% {return PERCENTAGE;}
+{num} {return NUMBER;}
+
+"url("{w}{string}{w}")" {return URI;}
+"url("{w}{url}{w}")" {return URI;}
+{ident}"(" {return FUNCTION;}
+
+. {return *yytext;}
+
+%%
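Note: under these rules a simple declaration tokenizes roughly as follows
(leading whitespace folds into LBRACE via the {w} prefix; characters with
no rule of their own fall through to the final catch-all):

    body { background: url(bg.png); }

    IDENT LBRACE S IDENT ':' S URI ';' S '}'

The URI token covers the whole url(bg.png) span; get_uri_string in
css-url.c then trims the url( prefix, the closing parenthesis, optional
quotes, and any padding inside the parentheses.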
diff --git a/src/html-parse.c b/src/html-parse.c
index ade82f2b..f744597b 100644
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -271,6 +271,94 @@ struct pool {
+struct tagstack_item {
+ const char *tagname_begin;
+ const char *tagname_end;
+ const char *contents_begin;
+ struct tagstack_item *prev;
+ struct tagstack_item *next;
+};
+
+struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+ struct tagstack_item *ts = xmalloc (sizeof (struct tagstack_item));
+ if (*head == NULL)
+ {
+ *head = *tail = ts;
+ ts->prev = ts->next = NULL;
+ }
+ else
+ {
+ (*tail)->next = ts;
+ ts->prev = *tail;
+ *tail = ts;
+ ts->next = NULL;
+ }
+
+ return ts;
+}
+
+/* remove ts and everything after it from the stack */
+void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+ struct tagstack_item *ts)
+{
+ if (*head == NULL)
+ return;
+
+ if (ts == *tail)
+ {
+ if (ts == *head)
+ {
+ xfree (ts);
+ *head = *tail = NULL;
+ }
+ else
+ {
+ ts->prev->next = NULL;
+ *tail = ts->prev;
+ xfree (ts);
+ }
+ }
+ else
+ {
+ if (ts == *head)
+ {
+ *head = NULL;
+ }
+ *tail = ts->prev;
+
+ if (ts->prev)
+ {
+ ts->prev->next = NULL;
+ }
+ while (ts)
+ {
+ struct tagstack_item *p = ts->next;
+ xfree (ts);
+ ts = p;
+ }
+ }
+}
+
+struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+ const char *tagname_end)
+{
+ int len = tagname_end - tagname_begin;
+ while (tail)
+ {
+ if (len == (tail->tagname_end - tail->tagname_begin))
+ {
+ if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
+ return tail;
+ }
+ tail = tail->prev;
+ }
+ return NULL;
+}
+
/* Decode the HTML character entity at *PTR, considering END to be end
of buffer. It is assumed that the "&" character that marks the
beginning of the entity has been seen at *PTR-1. If a recognized
@@ -756,6 +844,9 @@ map_html_tags (const char *text, int size,
bool attr_pair_resized = false;
struct attr_pair *pairs = attr_pair_initial_storage;
+ struct tagstack_item *head = NULL;
+ struct tagstack_item *tail = NULL;
+
if (!size)
return;
@@ -822,6 +913,18 @@ map_html_tags (const char *text, int size,
goto look_for_tag;
tag_name_end = p;
SKIP_WS (p);
+
+ if (!end_tag)
+ {
+ struct tagstack_item *ts = tagstack_push (&head, &tail);
+ if (ts)
+ {
+ ts->tagname_begin = tag_name_begin;
+ ts->tagname_end = tag_name_end;
+ ts->contents_begin = NULL;
+ }
+ }
+
if (end_tag && *p != '>')
goto backout_tag;
@@ -983,6 +1086,11 @@ map_html_tags (const char *text, int size,
++nattrs;
}
+ if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+ {
+ tail->contents_begin = p+1;
+ }
+
if (uninteresting_tag)
{
ADVANCE (p);
@@ -994,6 +1102,7 @@ map_html_tags (const char *text, int size,
{
int i;
struct taginfo taginfo;
+ struct tagstack_item *ts = NULL;
taginfo.name = pool.contents;
taginfo.end_tag_p = end_tag;
@@ -1010,6 +1119,23 @@ map_html_tags (const char *text, int size,
taginfo.attrs = pairs;
taginfo.start_position = tag_start_position;
taginfo.end_position = p + 1;
+ taginfo.contents_begin = NULL;
+ taginfo.contents_end = NULL;
+
+ if (end_tag)
+ {
+ ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+ if (ts)
+ {
+ if (ts->contents_begin)
+ {
+ taginfo.contents_begin = ts->contents_begin;
+ taginfo.contents_end = tag_start_position;
+ }
+ tagstack_pop (&head, &tail, ts);
+ }
+ }
+
mapfun (&taginfo, maparg);
ADVANCE (p);
}
@@ -1029,6 +1155,8 @@ map_html_tags (const char *text, int size,
POOL_FREE (&pool);
if (attr_pair_resized)
xfree (pairs);
+ /* pop any tag stack that's left */
+ tagstack_pop (&head, &tail, head);
}
#undef ADVANCE
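Note: the point of the tag stack is to let the mapper hand over the raw
bytes between a start tag and its matching end tag. For an inline
stylesheet, the closing-tag callback sees something like this
(illustrative):

    /* Input: <style type="text/css">p { color: red }</style>
       On the callback for the closing tag:
         taginfo.end_tag_p      == 1
         taginfo.contents_begin -> first byte after the opening tag's '>'
         taginfo.contents_end   -> first byte of "</style>"
       For an end tag with no matching start tag, both stay NULL. */

html-url.c below uses exactly these two pointers to feed <style> element
contents to the CSS scanner.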
diff --git a/src/html-parse.h b/src/html-parse.h
index abe3b08d..ed1c6855 100644
--- a/src/html-parse.h
+++ b/src/html-parse.h
@@ -52,6 +52,9 @@ struct taginfo {
const char *start_position; /* start position of tag */
const char *end_position; /* end position of tag */
+
+ const char *contents_begin; /* delimiters of tag contents */
+ const char *contents_end; /* only valid if end_tag_p */
};
struct hash_table; /* forward declaration */
diff --git a/src/html-url.c b/src/html-url.c
index e9f2773a..c9cf28f6 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -41,9 +41,9 @@ as that of the covered work. */
#include "utils.h"
#include "hash.h"
#include "convert.h"
-#include "recur.h" /* declaration of get_urls_html */
-
-struct map_context;
+#include "recur.h"
+#include "html-url.h"
+#include "css-url.h"
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
@@ -163,11 +163,12 @@ static struct {
from the information above. However, some places in the code refer
to the attributes not mentioned here. We add them manually. */
static const char *additional_attributes[] = {
- "rel", /* used by tag_handle_link */
- "http-equiv", /* used by tag_handle_meta */
- "name", /* used by tag_handle_meta */
- "content", /* used by tag_handle_meta */
- "action" /* used by tag_handle_form */
+ "rel", /* used by tag_handle_link */
+ "http-equiv", /* used by tag_handle_meta */
+ "name", /* used by tag_handle_meta */
+ "content", /* used by tag_handle_meta */
+ "action", /* used by tag_handle_form */
+ "style" /* used by check_style_attr */
};
static struct hash_table *interesting_tags;
@@ -246,28 +247,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind)
return NULL;
}
-struct map_context {
- char *text; /* HTML text. */
- char *base; /* Base URI of the document, possibly
- changed through <base href=...>. */
- const char *parent_base; /* Base of the current document. */
- const char *document_file; /* File name of this document. */
- bool nofollow; /* whether NOFOLLOW was specified in a
- <meta name=robots> tag. */
-
- struct urlpos *head, *tail; /* List of URLs that is being
- built. */
-};
+/* used for calls to append_url */
+#define ATTR_POS(tag, attrind, ctx) \
+ (tag->attrs[attrind].value_raw_beginning - ctx->text)
+#define ATTR_SIZE(tag, attrind) \
+ (tag->attrs[attrind].value_raw_size)
/* Append LINK_URI to the urlpos structure that is being built.
- LINK_URI will be merged with the current document base. TAG and
- ATTRIND are the necessary context to store the position and
- size. */
+ LINK_URI will be merged with the current document base.
+*/
-static struct urlpos *
-append_url (const char *link_uri,
- struct taginfo *tag, int attrind, struct map_context *ctx)
+struct urlpos *
+append_url (const char *link_uri, int position, int size,
+ struct map_context *ctx)
{
int link_has_scheme = url_has_scheme (link_uri);
struct urlpos *newel;
@@ -325,8 +318,8 @@ append_url (const char *link_uri,
newel = xnew0 (struct urlpos);
newel->url = url;
- newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
- newel->size = tag->attrs[attrind].value_raw_size;
+ newel->pos = position;
+ newel->size = size;
/* A URL is relative if the host is not named, and the name does not
start with `/'. */
@@ -346,6 +339,18 @@ append_url (const char *link_uri,
return newel;
}
+static void
+check_style_attr (struct taginfo *tag, struct map_context *ctx)
+{
+ int attrind;
+ char *style = find_attr (tag, "style", &attrind);
+ if (!style)
+ return;
+
+ /* raw pos and raw size include the quotes, hence the +1 -2 */
+ get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
+}
+
/* All the tag_* functions are called from collect_tags_mapper, as
specified by KNOWN_TAGS. */
@@ -393,7 +398,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
if (0 == strcasecmp (tag->attrs[attrind].name,
tag_url_attributes[i].attr_name))
{
- struct urlpos *up = append_url (link, tag, attrind, ctx);
+ struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
{
int flags = tag_url_attributes[i].flags;
@@ -418,7 +424,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
if (!newbase)
return;
- base_urlpos = append_url (newbase, tag, attrind, ctx);
+ base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (!base_urlpos)
return;
base_urlpos->ignore_when_downloading = 1;
@@ -439,9 +446,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
{
int attrind;
char *action = find_attr (tag, "action", &attrind);
+
if (action)
{
- struct urlpos *up = append_url (action, tag, attrind, ctx);
+ struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
up->ignore_when_downloading = 1;
}
@@ -464,14 +473,23 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
*/
if (href)
{
- struct urlpos *up = append_url (href, tag, attrind, ctx);
+ struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
{
char *rel = find_attr (tag, "rel", NULL);
- if (rel
- && (0 == strcasecmp (rel, "stylesheet")
- || 0 == strcasecmp (rel, "shortcut icon")))
- up->link_inline_p = 1;
+ if (rel)
+ {
+ if (0 == strcasecmp (rel, "stylesheet"))
+ {
+ up->link_inline_p = 1;
+ up->link_expect_css = 1;
+ }
+ else if (0 == strcasecmp (rel, "shortcut icon"))
+ {
+ up->link_inline_p = 1;
+ }
+ }
else
/* The external ones usually point to HTML pages, such as
<link rel="next" href="...">. */
@@ -525,7 +543,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
while (c_isspace (*p))
++p;
- entry = append_url (p, tag, attrind, ctx);
+ entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (entry)
{
entry->link_refresh_p = 1;
@@ -570,11 +589,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
struct map_context *ctx = (struct map_context *)arg;
/* Find the tag in our table of tags. This must not fail because
- map_html_tags only returns tags found in interesting_tags. */
+ map_html_tags only returns tags found in interesting_tags.
+
+ For now, though, we pass NULL as interesting_tags to
+ map_html_tags so that every tag can be checked for a style
+ attribute; the lookup below may therefore come up empty.
+ */
struct known_tag *t = hash_table_get (interesting_tags, tag->name);
- assert (t != NULL);
- t->handler (t->tagid, tag, ctx);
+ if (t != NULL)
+ t->handler (t->tagid, tag, ctx);
+
+ check_style_attr (tag, ctx);
+
+ if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
+ tag->contents_begin && tag->contents_end)
+ {
+ /* parse contents */
+ get_urls_css (ctx, tag->contents_begin - ctx->text,
+ tag->contents_end - tag->contents_begin);
+ }
}
/* Analyze HTML tags FILE and construct a list of URLs referenced from
@@ -618,8 +652,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
+ /* the NULL here used to be interesting_tags */
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
- interesting_tags, interesting_attributes);
+ NULL, interesting_attributes);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
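Note: a worked example of the +1/-2 arithmetic in check_style_attr, which
is needed because value_raw_beginning/value_raw_size include the
attribute's quote characters:

    /* <div style="background: url(i.png)">
                  ^ value_raw_beginning points at the opening quote
       ATTR_POS()+1 skips it and ATTR_SIZE()-2 drops both quotes, so the
       CSS scanner is handed exactly:  background: url(i.png)  */

This assumes the value really is quoted; for an unquoted style attribute
(legal in HTML) the arithmetic would shave one character off each end.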
diff --git a/src/html-url.h b/src/html-url.h
new file mode 100644
index 00000000..a94f0db6
--- /dev/null
+++ b/src/html-url.h
@@ -0,0 +1,51 @@
+/* Declarations for html-url.c.
+ Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#ifndef HTML_URL_H
+#define HTML_URL_H
+
+struct map_context {
+ char *text; /* HTML text. */
+ char *base; /* Base URI of the document, possibly
+ changed through <base href=...>. */
+ const char *parent_base; /* Base of the current document. */
+ const char *document_file; /* File name of this document. */
+ bool nofollow; /* whether NOFOLLOW was specified in a
+ <meta name=robots> tag. */
+
+ struct urlpos *head, *tail; /* List of URLs that is being
+ built. */
+};
+
+struct urlpos *get_urls_file (const char *);
+struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *append_url (const char *, int, int, struct map_context *);
+void free_urlpos (struct urlpos *);
+
+#endif /* HTML_URL_H */
diff --git a/src/http.c b/src/http.c
index ec815c8f..fb8184f1 100644
--- a/src/http.c
+++ b/src/http.c
@@ -74,6 +74,7 @@ static char *create_authorization_line (const char *, const char *,
const char *, bool *);
static char *basic_authentication_encode (const char *, const char *);
static bool known_authentication_scheme_p (const char *, const char *);
+static void ensure_extension (struct http_stat *, const char *, int *);
static void load_cookies (void);
#ifndef MIN
@@ -86,6 +87,7 @@ static struct cookie_jar *wget_cookie_jar;
#define TEXTHTML_S "text/html"
#define TEXTXHTML_S "application/xhtml+xml"
+#define TEXTCSS_S "text/css"
/* Some status code validation macros: */
#define H_20X(x) (((x) >= 200) && ((x) < 300))
@@ -2114,34 +2116,25 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
else
*dt &= ~TEXTHTML;
- if (opt.html_extension && (*dt & TEXTHTML))
- /* -E / --html-extension / html_extension = on was specified, and this is a
- text/html file. If some case-insensitive variation on ".htm[l]" isn't
- already the file's suffix, tack on ".html". */
- {
- char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+ if (type &&
+ 0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
+ *dt |= TEXTCSS;
+ else
+ *dt &= ~TEXTCSS;
- if (last_period_in_local_filename == NULL
- || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
- || 0 == strcasecmp (last_period_in_local_filename, ".html")))
+ if (opt.html_extension)
+ {
+ if (*dt & TEXTHTML)
+ /* -E / --html-extension / html_extension = on was specified,
+ and this is a text/html file. If some case-insensitive
+ variation on ".htm[l]" isn't already the file's suffix,
+ tack on ".html". */
{
- int local_filename_len = strlen (hs->local_file);
- /* Resize the local file, allowing for ".html" preceded by
- optional ".NUMBER". */
- hs->local_file = xrealloc (hs->local_file,
- local_filename_len + 24 + sizeof (".html"));
- strcpy(hs->local_file + local_filename_len, ".html");
- /* If clobbering is not allowed and the file, as named,
- exists, tack on ".NUMBER.html" instead. */
- if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
- {
- int ext_num = 1;
- do
- sprintf (hs->local_file + local_filename_len,
- ".%d.html", ext_num++);
- while (file_exists_p (hs->local_file));
- }
- *dt |= ADDED_HTML_EXTENSION;
+ ensure_extension (hs, ".html", dt);
+ }
+ else if (*dt & TEXTCSS)
+ {
+ ensure_extension (hs, ".css", dt);
}
}
@@ -3181,6 +3174,42 @@ http_cleanup (void)
cookie_jar_delete (wget_cookie_jar);
}
+static void
+ensure_extension (struct http_stat *hs, const char *ext, int *dt)
+{
+ char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+ char shortext[8];
+ int len = strlen (ext);
+
+ /* For ".html", also accept the shorter ".htm" as an existing
+ suffix; otherwise leave shortext empty. */
+ shortext[0] = '\0';
+ if (len == 5)
+ {
+ strncpy (shortext, ext, len - 1);
+ shortext[len - 1] = '\0';
+ }
+
+ if (last_period_in_local_filename == NULL
+ || !(0 == strcasecmp (last_period_in_local_filename, shortext)
+ || 0 == strcasecmp (last_period_in_local_filename, ext)))
+ {
+ int local_filename_len = strlen (hs->local_file);
+ /* Resize the local file, allowing for EXT preceded by
+ optional ".NUMBER". */
+ hs->local_file = xrealloc (hs->local_file,
+ local_filename_len + 24 + len);
+ strcpy (hs->local_file + local_filename_len, ext);
+ /* If clobbering is not allowed and the file, as named,
+ exists, tack on ".NUMBER" followed by EXT instead. */
+ if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
+ {
+ int ext_num = 1;
+ do
+ sprintf (hs->local_file + local_filename_len,
+ ".%d%s", ext_num++, ext);
+ while (file_exists_p (hs->local_file));
+ }
+ *dt |= ADDED_HTML_EXTENSION;
+ }
+}
+
#ifdef TESTING
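Note: a behavior sketch for ensure_extension under -E (file names
hypothetical):

    ext ".html":  "page"      -> "page.html"
                  "page.htm"  -> unchanged (".htm" accepted via shortext)
                  "page.html" -> unchanged
    ext ".css":   "style"     -> "style.css"
                  "style.css" -> unchanged
    collision with clobbering disallowed: "page.1.html", "page.2.html", ...

Both paths still set ADDED_HTML_EXTENSION; a dedicated flag for CSS would
let write_backup_file (see the TODO in convert.c) tell the two apart.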
diff --git a/src/recur.c b/src/recur.c
index c11cfdad..daf8a374 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -48,17 +48,19 @@ as that of the covered work. */
#include "hash.h"
#include "res.h"
#include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
#include "spider.h"
-
+
/* Functions for maintaining the URL queue. */
struct queue_element {
const char *url; /* the URL to download */
const char *referer; /* the referring document */
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ bool css_allowed; /* whether the document is allowed to
+ be treated as CSS. */
struct queue_element *next; /* next element in queue */
};
@@ -91,13 +93,15 @@ url_queue_delete (struct url_queue *queue)
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
+ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
+ qel->css_allowed = css_allowed;
qel->next = NULL;
++queue->count;
@@ -121,7 +125,7 @@ url_enqueue (struct url_queue *queue,
static bool
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth,
- bool *html_allowed)
+ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
@@ -136,6 +140,7 @@ url_dequeue (struct url_queue *queue,
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
+ *css_allowed = qel->css_allowed;
--queue->count;
@@ -200,7 +205,7 @@ retrieve_tree (const char *start_url)
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
@@ -208,7 +213,8 @@ retrieve_tree (const char *start_url)
bool descend = false;
char *url, *referer, *file = NULL;
int depth;
- bool html_allowed;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
bool dash_p_leaf_HTML = false;
if (opt.quota && total_downloaded_bytes > opt.quota)
@@ -220,7 +226,7 @@ retrieve_tree (const char *start_url)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
+ &depth, &html_allowed, &css_allowed))
break;
/* ...and download it. Note that this download is in most cases
@@ -238,10 +244,21 @@ retrieve_tree (const char *start_url)
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
+ /* TODO: these two checks should be combined. */
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
}
else
{
@@ -252,7 +269,21 @@ retrieve_tree (const char *start_url)
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* CSS is a little different: css_allowed can override the content
+ type, since lots of web servers serve CSS with an incorrect
+ content type. */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
if (redirected)
{
@@ -306,14 +337,15 @@ retrieve_tree (const char *start_url)
}
}
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
links it contains. */
if (descend)
{
bool meta_disallow_follow = false;
struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
@@ -345,7 +377,8 @@ retrieve_tree (const char *start_url)
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
- child->link_expect_html);
+ child->link_expect_html,
+ child->link_expect_css);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
@@ -394,9 +427,9 @@ retrieve_tree (const char *start_url)
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
+ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
xfree (d1);
xfree_null (d2);
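Note: taken together, the queue changes thread one bit of "expected type"
from parse time to fetch time: an @import link is enqueued with
css_allowed set, and on dequeue that bit may override a bogus
Content-Type. A sketch of the round trip (URL hypothetical):

    url_enqueue (queue, xstrdup ("http://example.com/a.css"),
                 xstrdup (referer), depth + 1,
                 /* html_allowed */ false, /* css_allowed */ true);
    ...
    url_dequeue (queue, &url, &referer, &depth, &html_allowed, &css_allowed);
    /* later: descend as CSS when (dt & TEXTCSS) || css_allowed */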
diff --git a/src/recur.h b/src/recur.h
index d2c8e614..5ab26a95 100644
--- a/src/recur.h
+++ b/src/recur.h
@@ -44,9 +44,4 @@ struct urlpos;
void recursive_cleanup (void);
uerr_t retrieve_tree (const char *);
-/* These are really in html-url.c. */
-struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
-void free_urlpos (struct urlpos *);
-
#endif /* RECUR_H */
diff --git a/src/retr.c b/src/retr.c
index 179430ac..7bdd4193 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,6 +51,7 @@ as that of the covered work. */
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
@@ -778,6 +779,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
+ if (*dt & TEXTCSS)
+ register_css (u->url, local_file);
}
}
diff --git a/src/wget.h b/src/wget.h
index 08d8d837..5b0df1a8 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -312,7 +312,8 @@ enum
HEAD_ONLY = 0x0004, /* only send the HEAD request */
SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */
ACCEPTRANGES = 0x0010, /* Accept-ranges header was found */
- ADDED_HTML_EXTENSION = 0x0020 /* added ".html" extension due to -E */
+ ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */
+ TEXTCSS = 0x0040 /* document is of type text/css */
};
/* Universal error type -- used almost everywhere. Error reporting of