mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Ted Mielczarek's CSS wonder-patch, applied against the source from around the time the patch was written.
This commit is contained in:
parent
3f51773542
commit
a0d0f332d5
@ -115,6 +115,9 @@ test -z "$CC" && cc_specified=yes
|
||||
AC_PROG_CC
|
||||
AC_AIX
|
||||
|
||||
YYTEXT_POINTER=1
|
||||
AC_PROG_LEX
|
||||
|
||||
dnl Turn on optimization by default. Specifically:
|
||||
dnl
|
||||
dnl if the user hasn't specified CFLAGS, then
|
||||
|
@ -54,6 +54,7 @@ CFLAGS = @CFLAGS@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBS = @LIBS@ @LIBSSL@ @LIBGNUTLS@
|
||||
exeext = @exeext@
|
||||
LEX = @LEX@
|
||||
|
||||
INCLUDES = -I. -I$(srcdir)
|
||||
|
||||
@ -72,12 +73,12 @@ NTLM_OBJ = @NTLM_OBJ@
|
||||
SSL_OBJ = @SSL_OBJ@
|
||||
GETOPT_OBJ = @GETOPT_OBJ@
|
||||
|
||||
OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \
|
||||
OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o css-url.o \
|
||||
ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
|
||||
host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \
|
||||
log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
|
||||
res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \
|
||||
url.o utils.o version.o xmalloc.o
|
||||
lex.yy.o log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o \
|
||||
recur.o res.o retr.o safe-ctype.o snprintf.o spider.o \
|
||||
$(SSL_OBJ) url.o utils.o version.o xmalloc.o
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .o
|
||||
@ -90,14 +91,17 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \
|
||||
wget$(exeext): $(OBJ)
|
||||
$(LINK) $(OBJ) $(LIBS)
|
||||
|
||||
lex.yy.c: css.lex
|
||||
$(LEX) $<
|
||||
|
||||
# We make object files depend on every header. Rather than attempt to
|
||||
# track dependencies, everything gets recompiled when a header
|
||||
# changes. With a program of Wget's size this doesn't waste much
|
||||
# time, and it's a lot safer than attempting to get all the
|
||||
# dependencies right.
|
||||
|
||||
$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
|
||||
gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
|
||||
$(OBJ): config-post.h config.h connect.h convert.h cookies.h css-url.h \
|
||||
ftp.h gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \
|
||||
http-ntlm.h init.h log.h mswindows.h netrc.h options.h \
|
||||
progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \
|
||||
spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
|
||||
@ -122,7 +126,7 @@ uninstall.bin:
|
||||
#
|
||||
|
||||
clean:
|
||||
$(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]*
|
||||
$(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]* lex.yy.c
|
||||
|
||||
distclean: clean
|
||||
$(RM) Makefile config.h
|
||||
|
100
src/convert.c
100
src/convert.c
@ -46,50 +46,37 @@ so, delete this exception statement from your version. */
|
||||
#include "hash.h"
|
||||
#include "ptimer.h"
|
||||
#include "res.h"
|
||||
#include "html-url.h"
|
||||
#include "css-url.h"
|
||||
|
||||
static struct hash_table *dl_file_url_map;
|
||||
struct hash_table *dl_url_file_map;
|
||||
|
||||
/* Set of HTML files downloaded in this Wget run, used for link
|
||||
/* Set of HTML/CSS files downloaded in this Wget run, used for link
|
||||
conversion after Wget is done. */
|
||||
struct hash_table *downloaded_html_set;
|
||||
struct hash_table *downloaded_css_set;
|
||||
|
||||
static void convert_links (const char *, struct urlpos *);
|
||||
|
||||
/* This function is called when the retrieval is done to convert the
|
||||
links that have been downloaded. It has to be called at the end of
|
||||
the retrieval, because only then does Wget know conclusively which
|
||||
URLs have been downloaded, and which not, so it can tell which
|
||||
direction to convert to.
|
||||
|
||||
The "direction" means that the URLs to the files that have been
|
||||
downloaded get converted to the relative URL which will point to
|
||||
that file. And the other URLs get converted to the remote URL on
|
||||
the server.
|
||||
|
||||
All the downloaded HTMLs are kept in downloaded_html_files, and
|
||||
downloaded URLs in urls_downloaded. All the information is
|
||||
extracted from these two lists. */
|
||||
|
||||
void
|
||||
convert_all_links (void)
|
||||
convert_links_in_hashtable (struct hash_table *downloaded_set,
|
||||
int is_css,
|
||||
int *file_count)
|
||||
{
|
||||
int i;
|
||||
double secs;
|
||||
int file_count = 0;
|
||||
|
||||
struct ptimer *timer = ptimer_new ();
|
||||
|
||||
int cnt;
|
||||
char **file_array;
|
||||
|
||||
cnt = 0;
|
||||
if (downloaded_html_set)
|
||||
cnt = hash_table_count (downloaded_html_set);
|
||||
if (downloaded_set)
|
||||
cnt = hash_table_count (downloaded_set);
|
||||
if (cnt == 0)
|
||||
return;
|
||||
file_array = alloca_array (char *, cnt);
|
||||
string_set_to_array (downloaded_html_set, file_array);
|
||||
string_set_to_array (downloaded_set, file_array);
|
||||
|
||||
for (i = 0; i < cnt; i++)
|
||||
{
|
||||
@ -97,7 +84,7 @@ convert_all_links (void)
|
||||
char *url;
|
||||
char *file = file_array[i];
|
||||
|
||||
/* Determine the URL of the HTML file. get_urls_html will need
|
||||
/* Determine the URL of the file. get_urls_{html,css} will need
|
||||
it. */
|
||||
url = hash_table_get (dl_file_url_map, file);
|
||||
if (!url)
|
||||
@ -108,8 +95,9 @@ convert_all_links (void)
|
||||
|
||||
DEBUGP (("Scanning %s (from %s)\n", file, url));
|
||||
|
||||
/* Parse the HTML file... */
|
||||
urls = get_urls_html (file, url, NULL);
|
||||
/* Parse the file... */
|
||||
urls = is_css ? get_urls_css_file (file, url) :
|
||||
get_urls_html (file, url, NULL);
|
||||
|
||||
/* We don't respect meta_disallow_follow here because, even if
|
||||
the file is not followed, we might still want to convert the
|
||||
@ -161,11 +149,38 @@ convert_all_links (void)
|
||||
|
||||
/* Convert the links in the file. */
|
||||
convert_links (file, urls);
|
||||
++file_count;
|
||||
++*file_count;
|
||||
|
||||
/* Free the data. */
|
||||
free_urlpos (urls);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function is called when the retrieval is done to convert the
|
||||
links that have been downloaded. It has to be called at the end of
|
||||
the retrieval, because only then does Wget know conclusively which
|
||||
URLs have been downloaded, and which not, so it can tell which
|
||||
direction to convert to.
|
||||
|
||||
The "direction" means that the URLs to the files that have been
|
||||
downloaded get converted to the relative URL which will point to
|
||||
that file. And the other URLs get converted to the remote URL on
|
||||
the server.
|
||||
|
||||
All the downloaded HTML and CSS files are kept in downloaded_html_set
|
||||
and downloaded_css_set, and the URL each one came from in
|
||||
dl_file_url_map. All the information is extracted from these tables. */
|
||||
|
||||
void
|
||||
convert_all_links (void)
|
||||
{
|
||||
double secs;
|
||||
int file_count = 0;
|
||||
|
||||
struct ptimer *timer = ptimer_new ();
|
||||
|
||||
convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
|
||||
convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
|
||||
|
||||
secs = ptimer_measure (timer);
|
||||
ptimer_destroy (timer);
|
||||
@ -174,13 +189,14 @@ convert_all_links (void)
|
||||
}
|
||||
|
||||
static void write_backup_file (const char *, downloaded_file_t);
|
||||
static const char *replace_plain (const char*, int, FILE*, const char *);
|
||||
static const char *replace_attr (const char *, int, FILE *, const char *);
|
||||
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
|
||||
const char *, int);
|
||||
static char *local_quote_string (const char *);
|
||||
static char *construct_relative (const char *, const char *);
|
||||
|
||||
/* Change the links in one HTML file. LINKS is a list of links in the
|
||||
/* Change the links in one file. LINKS is a list of links in the
|
||||
document, along with their positions and the desired direction of
|
||||
the conversion. */
|
||||
static void
|
||||
@ -277,7 +293,9 @@ convert_links (const char *file, struct urlpos *links)
|
||||
char *newname = construct_relative (file, link->local_name);
|
||||
char *quoted_newname = local_quote_string (newname);
|
||||
|
||||
if (!link->link_refresh_p)
|
||||
if (link->link_css_p)
|
||||
p = replace_plain (p, link->size, fp, quoted_newname);
|
||||
else if (!link->link_refresh_p)
|
||||
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||
else
|
||||
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
|
||||
@ -296,7 +314,9 @@ convert_links (const char *file, struct urlpos *links)
|
||||
char *newlink = link->url->url;
|
||||
char *quoted_newlink = html_quote_string (newlink);
|
||||
|
||||
if (!link->link_refresh_p)
|
||||
if (link->link_css_p)
|
||||
p = replace_plain (p, link->size, fp, quoted_newlink);
|
||||
else if (!link->link_refresh_p)
|
||||
p = replace_attr (p, link->size, fp, quoted_newlink);
|
||||
else
|
||||
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
|
||||
@ -406,6 +426,7 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
||||
size_t filename_len = strlen (file);
|
||||
char* filename_plus_orig_suffix;
|
||||
|
||||
/* TODO: hack this to work with css files */
|
||||
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
|
||||
{
|
||||
/* Just write "orig" over "html". We need to do it this way
|
||||
@ -465,6 +486,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
|
||||
|
||||
static bool find_fragment (const char *, int, const char **, const char **);
|
||||
|
||||
/* Write NEW_TEXT to FP as the replacement for the SIZE bytes that
   start at P, and return the position just past those bytes.  The
   text is emitted verbatim; quoting is ignored.  */
static const char *
replace_plain (const char *p, int size, FILE *fp, const char *new_text)
{
  const char *past_original = p + size;

  fputs (new_text, fp);
  return past_original;
}
|
||||
|
||||
/* Replace an attribute's original text with NEW_TEXT. */
|
||||
|
||||
static const char *
|
||||
@ -832,6 +862,16 @@ register_html (const char *url, const char *file)
|
||||
string_set_add (downloaded_html_set, file);
|
||||
}
|
||||
|
||||
/* Register that FILE is a CSS file that has been downloaded. */
|
||||
|
||||
void
|
||||
register_css (const char *url, const char *file)
|
||||
{
|
||||
if (!downloaded_css_set)
|
||||
downloaded_css_set = make_string_hash_table (0);
|
||||
string_set_add (downloaded_css_set, file);
|
||||
}
|
||||
|
||||
static void downloaded_files_free (void);
|
||||
|
||||
/* Cleanup the data structures associated with this file. */
|
||||
|
@ -33,6 +33,7 @@ so, delete this exception statement from your version. */
|
||||
struct hash_table; /* forward decl */
|
||||
extern struct hash_table *dl_url_file_map;
|
||||
extern struct hash_table *downloaded_html_set;
|
||||
extern struct hash_table *downloaded_css_set;
|
||||
|
||||
enum convert_options {
|
||||
CO_NOCONVERT = 0, /* don't convert this URL */
|
||||
@ -64,7 +65,9 @@ struct urlpos {
|
||||
unsigned int link_complete_p :1; /* the link was complete (had host name) */
|
||||
unsigned int link_base_p :1; /* the url came from <base href=...> */
|
||||
unsigned int link_inline_p :1; /* needed to render the page */
|
||||
unsigned int link_css_p :1; /* the url came from CSS */
|
||||
unsigned int link_expect_html :1; /* expected to contain HTML */
|
||||
unsigned int link_expect_css :1; /* expected to contain CSS */
|
||||
|
||||
unsigned int link_refresh_p :1; /* link was received from
|
||||
<meta http-equiv=refresh content=...> */
|
||||
@ -98,6 +101,7 @@ downloaded_file_t downloaded_file (downloaded_file_t, const char *);
|
||||
void register_download (const char *, const char *);
|
||||
void register_redirection (const char *, const char *);
|
||||
void register_html (const char *, const char *);
|
||||
void register_css (const char *, const char *);
|
||||
void register_delete_file (const char *);
|
||||
void convert_all_links (void);
|
||||
void convert_cleanup (void);
|
||||
|
66
src/css-tokens.h
Normal file
66
src/css-tokens.h
Normal file
@ -0,0 +1,66 @@
|
||||
/* Declarations for css.lex
|
||||
Copyright (C) 2006 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Wget; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
In addition, as a special exception, the Free Software Foundation
|
||||
gives permission to link the code of its release of Wget with the
|
||||
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||
that use the same license as the "OpenSSL" library), and distribute
|
||||
the linked executables. You must obey the GNU General Public License
|
||||
in all respects for all of the code used other than "OpenSSL". If you
|
||||
modify this file, you may extend this exception to your version of the
|
||||
file, but you are not obligated to do so. If you do not wish to do
|
||||
so, delete this exception statement from your version. */
|
||||
|
||||
#ifndef CSS_TOKENS_H
|
||||
#define CSS_TOKENS_H
|
||||
|
||||
/* Token codes returned by the scanner generated from css.lex.

   This is declared as an enum *tag*, not as `enum { ... } css_tokens;':
   the latter defines an object called css_tokens in every file that
   includes this header, which is a duplicate definition at link time
   when the compiler does not merge common symbols.

   CSSEOF comes first so that it has the value 0, which get_urls_css
   compares against the end-of-input return of yylex.  The order of the
   remaining codes must match the token_names table in css-url.c.  */
enum css_tokens {
  CSSEOF,
  S,
  CDO,
  CDC,
  INCLUDES,
  DASHMATCH,
  LBRACE,
  PLUS,
  GREATER,
  COMMA,
  STRING,
  INVALID,
  IDENT,
  HASH,
  IMPORT_SYM,
  PAGE_SYM,
  MEDIA_SYM,
  CHARSET_SYM,
  IMPORTANT_SYM,
  EMS,
  EXS,
  LENGTH,
  ANGLE,
  TIME,
  FREQ,
  DIMENSION,
  PERCENTAGE,
  NUMBER,
  URI,
  FUNCTION
};
|
||||
|
||||
#endif /* CSS_TOKENS_H */
|
273
src/css-url.c
Normal file
273
src/css-url.c
Normal file
@ -0,0 +1,273 @@
|
||||
/* Collect URLs from CSS source.
|
||||
Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Wget; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
In addition, as a special exception, the Free Software Foundation
|
||||
gives permission to link the code of its release of Wget with the
|
||||
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||
that use the same license as the "OpenSSL" library), and distribute
|
||||
the linked executables. You must obey the GNU General Public License
|
||||
in all respects for all of the code used other than "OpenSSL". If you
|
||||
modify this file, you may extend this exception to your version of the
|
||||
file, but you are not obligated to do so. If you do not wish to do
|
||||
so, delete this exception statement from your version. */
|
||||
|
||||
/*
|
||||
Note that this is not an actual CSS parser, but just a lexical
|
||||
scanner with a tiny bit more smarts bolted on top. A full parser
|
||||
is somewhat overkill for this job. The only things we're interested
|
||||
in are @import rules and url() tokens, so it's easy enough to
|
||||
grab those without truly understanding the input. The only downside
|
||||
to this is that we might be coerced into downloading files that
|
||||
a browser would ignore. That might merit some more investigation.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STRING_H
|
||||
# include <string.h>
|
||||
#else
|
||||
# include <strings.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "wget.h"
|
||||
#include "utils.h"
|
||||
#include "convert.h"
|
||||
#include "html-url.h"
|
||||
#include "css-tokens.h"
|
||||
|
||||
/* from lex.yy.c */
|
||||
extern char *yytext;
|
||||
extern int yyleng;
|
||||
typedef struct yy_buffer_state *YY_BUFFER_STATE;
|
||||
extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
|
||||
extern int yylex (void);
|
||||
|
||||
#if 1
/* Printable names of the scanner's token codes, indexed by token
   value; the entries follow the order of the enum in css-tokens.h.
   Kept for debugging traces.  */
const char *token_names[] = {
  "CSSEOF", "S", "CDO", "CDC", "INCLUDES", "DASHMATCH",
  "LBRACE", "PLUS", "GREATER", "COMMA",
  "STRING", "INVALID", "IDENT", "HASH",
  "IMPORT_SYM", "PAGE_SYM", "MEDIA_SYM", "CHARSET_SYM", "IMPORTANT_SYM",
  "EMS", "EXS", "LENGTH", "ANGLE", "TIME", "FREQ",
  "DIMENSION", "PERCENTAGE", "NUMBER",
  "URI", "FUNCTION"
};
#endif
|
||||
|
||||
/* Given a detected URI token, return a freshly allocated copy of just
   the URI specified within it.

   AT is the text buffer.  On entry *POS and *LENGTH delimit the whole
   token, from "url(" through the closing parenthesis.  On a non-NULL
   return they delimit the URI itself, so the caller can later replace
   exactly that span.  NULL is returned, with *POS and *LENGTH left
   untouched, if the token is too short or does not begin with "url(".
   The caller frees the returned string.

   A URI can be specified with or without quotes, and the quotes can
   be single or double quotes.  In addition there can be whitespace
   after the opening parenthesis and before the closing parenthesis.
   An empty or whitespace-only url() yields an empty string.  */

char *
get_uri_string (const char *at, int *pos, int *length)
{
  char *uri;
  int start, len;

  /* A well-formed token is at least "url()", i.e. five characters.  */
  if (*length < 5 || 0 != strncasecmp (at + *pos, "url(", 4))
    return NULL;

  start = *pos + 4;
  len = *length - 5;            /* drop "url(" and ")" */

  /* Skip leading space.  The LEN bound keeps a whitespace-only token
     from running past its own end.  The cast is needed because
     isspace is only defined for unsigned char values and EOF.  */
  while (len > 0 && isspace ((unsigned char) at[start]))
    {
      ++start;
      --len;
    }

  /* Skip trailing space.  Without the LEN bound this loop walks
     backwards over the leading space again and drives the length
     negative for input such as "url(   )".  */
  while (len > 0 && isspace ((unsigned char) at[start + len - 1]))
    --len;

  /* Trim off quotes; there must be room for both of them.  */
  if (len >= 2 && (at[start] == '\'' || at[start] == '"'))
    {
      ++start;
      len -= 2;
    }

  uri = xmalloc (len + 1);
  memcpy (uri, at + start, len);
  uri[len] = '\0';

  *pos = start;
  *length = len;
  return uri;
}
|
||||
|
||||
void
|
||||
get_urls_css (struct map_context *ctx, int offset, int buf_length)
|
||||
{
|
||||
int token;
|
||||
/*char tmp[2048];*/
|
||||
int buffer_pos = 0;
|
||||
int pos, length;
|
||||
char *uri;
|
||||
|
||||
/*
|
||||
strncpy(tmp,ctx->text + offset, buf_length);
|
||||
tmp[buf_length] = '\0';
|
||||
DEBUGP (("get_urls_css: \"%s\"\n", tmp));
|
||||
*/
|
||||
|
||||
/* tell flex to scan from this buffer */
|
||||
yy_scan_bytes (ctx->text + offset, buf_length);
|
||||
|
||||
while((token = yylex()) != CSSEOF)
|
||||
{
|
||||
/*DEBUGP (("%s ", token_names[token]));*/
|
||||
/* @import "foo.css"
|
||||
or @import url(foo.css)
|
||||
*/
|
||||
if(token == IMPORT_SYM)
|
||||
{
|
||||
do {
|
||||
buffer_pos += yyleng;
|
||||
} while((token = yylex()) == S);
|
||||
|
||||
/*DEBUGP (("%s ", token_names[token]));*/
|
||||
|
||||
if (token == STRING || token == URI)
|
||||
{
|
||||
/*DEBUGP (("Got URI "));*/
|
||||
pos = buffer_pos + offset;
|
||||
length = yyleng;
|
||||
|
||||
if (token == URI)
|
||||
{
|
||||
uri = get_uri_string (ctx->text, &pos, &length);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* cut out quote characters */
|
||||
pos++;
|
||||
length -= 2;
|
||||
uri = xmalloc (length + 1);
|
||||
strncpy (uri, yytext + 1, length);
|
||||
uri[length] = '\0';
|
||||
}
|
||||
|
||||
if (uri)
|
||||
{
|
||||
struct urlpos *up = append_url (uri, pos, length, ctx);
|
||||
DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
|
||||
|
||||
if (up)
|
||||
{
|
||||
up->link_inline_p = 1;
|
||||
up->link_css_p = 1;
|
||||
up->link_expect_css = 1;
|
||||
}
|
||||
|
||||
xfree(uri);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* background-image: url(foo.png)
|
||||
note that we don't care what
|
||||
property this is actually on.
|
||||
*/
|
||||
else if(token == URI)
|
||||
{
|
||||
pos = buffer_pos + offset;
|
||||
length = yyleng;
|
||||
uri = get_uri_string (ctx->text, &pos, &length);
|
||||
|
||||
if (uri)
|
||||
{
|
||||
struct urlpos *up = append_url (uri, pos, length, ctx);
|
||||
DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
|
||||
if (up)
|
||||
{
|
||||
up->link_inline_p = 1;
|
||||
up->link_css_p = 1;
|
||||
}
|
||||
|
||||
xfree (uri);
|
||||
}
|
||||
}
|
||||
buffer_pos += yyleng;
|
||||
}
|
||||
DEBUGP (("\n"));
|
||||
}
|
||||
|
||||
struct urlpos *
|
||||
get_urls_css_file (const char *file, const char *url)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct map_context ctx;
|
||||
|
||||
/* Load the file. */
|
||||
fm = read_file (file);
|
||||
if (!fm)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
||||
|
||||
ctx.text = fm->content;
|
||||
ctx.head = ctx.tail = NULL;
|
||||
ctx.base = NULL;
|
||||
ctx.parent_base = url ? url : opt.base_href;
|
||||
ctx.document_file = file;
|
||||
ctx.nofollow = 0;
|
||||
|
||||
get_urls_css (&ctx, 0, fm->length);
|
||||
read_file_free (fm);
|
||||
return ctx.head;
|
||||
}
|
36
src/css-url.h
Normal file
36
src/css-url.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* Declarations for css-url.c.
|
||||
Copyright (C) 2006 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Wget; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
In addition, as a special exception, the Free Software Foundation
|
||||
gives permission to link the code of its release of Wget with the
|
||||
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||
that use the same license as the "OpenSSL" library), and distribute
|
||||
the linked executables. You must obey the GNU General Public License
|
||||
in all respects for all of the code used other than "OpenSSL". If you
|
||||
modify this file, you may extend this exception to your version of the
|
||||
file, but you are not obligated to do so. If you do not wish to do
|
||||
so, delete this exception statement from your version. */
|
||||
|
||||
#ifndef CSS_URL_H
|
||||
#define CSS_URL_H
|
||||
|
||||
void get_urls_css (struct map_context *, int, int);
|
||||
struct urlpos *get_urls_css_file (const char *, const char *);
|
||||
|
||||
#endif /* CSS_URL_H */
|
137
src/css.lex
Normal file
137
src/css.lex
Normal file
@ -0,0 +1,137 @@
|
||||
%option case-insensitive
|
||||
%option noyywrap
|
||||
%option never-interactive
|
||||
|
||||
%{
|
||||
/* Lex source for CSS tokenizing.
|
||||
Taken from http://www.w3.org/TR/CSS21/grammar.html#q2
|
||||
Copyright (C) 2006 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Wget; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
In addition, as a special exception, the Free Software Foundation
|
||||
gives permission to link the code of its release of Wget with the
|
||||
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||
that use the same license as the "OpenSSL" library), and distribute
|
||||
the linked executables. You must obey the GNU General Public License
|
||||
in all respects for all of the code used other than "OpenSSL". If you
|
||||
modify this file, you may extend this exception to your version of the
|
||||
file, but you are not obligated to do so. If you do not wish to do
|
||||
so, delete this exception statement from your version. */
|
||||
|
||||
#include "css-tokens.h"
|
||||
|
||||
/* {s}+\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {unput(' '); } */
|
||||
/*replace by space*/
|
||||
%}
|
||||
|
||||
h [0-9a-f]
|
||||
nonascii [\200-\377]
|
||||
unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])?
|
||||
escape {unicode}|\\[^\r\n\f0-9a-f]
|
||||
nmstart [_a-z]|{nonascii}|{escape}
|
||||
nmchar [_a-z0-9-]|{nonascii}|{escape}
|
||||
string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
|
||||
string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\'
|
||||
invalid1 \"([^\n\r\f\\"]|\\{nl}|{escape})*
|
||||
invalid2 \'([^\n\r\f\\']|\\{nl}|{escape})*
|
||||
|
||||
comment \/\*[^*]*\*+([^/*][^*]*\*+)*\/
|
||||
ident -?{nmstart}{nmchar}*
|
||||
name {nmchar}+
|
||||
num [0-9]+|[0-9]*"."[0-9]+
|
||||
string {string1}|{string2}
|
||||
invalid {invalid1}|{invalid2}
|
||||
url ([!#$%&*-~]|{nonascii}|{escape})*
|
||||
s [ \t\r\n\f]
|
||||
w ({s}|{comment})*
|
||||
nl \n|\r\n|\r|\f
|
||||
|
||||
A a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?
|
||||
C c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?
|
||||
D d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?
|
||||
E e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?
|
||||
G g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g
|
||||
H h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h
|
||||
I i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i
|
||||
K k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k
|
||||
M m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m
|
||||
N n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n
|
||||
P p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p
|
||||
R r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r
|
||||
S s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s
|
||||
T t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t
|
||||
X x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x
|
||||
Z z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z
|
||||
|
||||
%%
|
||||
|
||||
{s} {return S;}
|
||||
|
||||
\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {return S;} /* ignore comments */
|
||||
|
||||
"<!--" {return CDO;}
|
||||
"-->" {return CDC;}
|
||||
"~=" {return INCLUDES;}
|
||||
"|=" {return DASHMATCH;}
|
||||
|
||||
{w}"{" {return LBRACE;}
|
||||
{w}"+" {return PLUS;}
|
||||
{w}">" {return GREATER;}
|
||||
{w}"," {return COMMA;}
|
||||
|
||||
{string} {return STRING;}
|
||||
{invalid} {return INVALID; /* unclosed string */}
|
||||
|
||||
{ident} {return IDENT;}
|
||||
|
||||
"#"{name} {return HASH;}
|
||||
|
||||
"@import" {return IMPORT_SYM;}
|
||||
"@page" {return PAGE_SYM;}
|
||||
"@media" {return MEDIA_SYM;}
|
||||
"@charset " {return CHARSET_SYM;}
|
||||
|
||||
"!"{w}"important" {return IMPORTANT_SYM;}
|
||||
|
||||
{num}{E}{M} {return EMS;}
|
||||
{num}{E}{X} {return EXS;}
|
||||
{num}{P}{X} {return LENGTH;}
|
||||
{num}{C}{M} {return LENGTH;}
|
||||
{num}{M}{M} {return LENGTH;}
|
||||
{num}{I}{N} {return LENGTH;}
|
||||
{num}{P}{T} {return LENGTH;}
|
||||
{num}{P}{C} {return LENGTH;}
|
||||
{num}{D}{E}{G} {return ANGLE;}
|
||||
{num}{R}{A}{D} {return ANGLE;}
|
||||
{num}{G}{R}{A}{D} {return ANGLE;}
|
||||
{num}{M}{S} {return TIME;}
|
||||
{num}{S} {return TIME;}
|
||||
{num}{H}{Z} {return FREQ;}
|
||||
{num}{K}{H}{Z} {return FREQ;}
|
||||
{num}{ident} {return DIMENSION;}
|
||||
|
||||
{num}% {return PERCENTAGE;}
|
||||
{num} {return NUMBER;}
|
||||
|
||||
"url("{w}{string}{w}")" {return URI;}
|
||||
"url("{w}{url}{w}")" {return URI;}
|
||||
{ident}"(" {return FUNCTION;}
|
||||
|
||||
. {return *yytext;}
|
||||
|
||||
%%
|
128
src/html-parse.c
128
src/html-parse.c
@ -271,6 +271,94 @@ struct pool {
|
||||
to "<foo", but "<,foo" to "<,foo". */
|
||||
#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
|
||||
|
||||
/* One entry of the stack of start tags seen by map_html_tags, kept as
   a doubly linked list that grows at the tail.  The pointers refer
   into the text being parsed; the entry does not own them.  */
struct tagstack_item {
|
||||
const char *tagname_begin; /* start of the tag's name in the parsed text */
|
||||
const char *tagname_end; /* end of the tag's name */
|
||||
const char *contents_begin; /* where the tag's contents start; NULL until known */
|
||||
struct tagstack_item *prev; /* entry pushed before this one, or NULL */
|
||||
struct tagstack_item *next; /* entry pushed after this one, or NULL */
|
||||
};
|
||||
|
||||
struct tagstack_item *
|
||||
tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
|
||||
{
|
||||
struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
|
||||
if (*head == NULL)
|
||||
{
|
||||
*head = *tail = ts;
|
||||
ts->prev = ts->next = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
(*tail)->next = ts;
|
||||
ts->prev = *tail;
|
||||
*tail = ts;
|
||||
ts->next = NULL;
|
||||
}
|
||||
|
||||
return ts;
|
||||
}
|
||||
|
||||
/* remove ts and everything after it from the stack */
|
||||
void
|
||||
tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
|
||||
struct tagstack_item *ts)
|
||||
{
|
||||
if (*head == NULL)
|
||||
return;
|
||||
|
||||
if (ts == *tail)
|
||||
{
|
||||
if (ts == *head)
|
||||
{
|
||||
xfree (ts);
|
||||
*head = *tail = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
ts->prev->next = NULL;
|
||||
*tail = ts->prev;
|
||||
xfree (ts);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ts == *head)
|
||||
{
|
||||
*head = NULL;
|
||||
}
|
||||
*tail = ts->prev;
|
||||
|
||||
if (ts->prev)
|
||||
{
|
||||
ts->prev->next = NULL;
|
||||
}
|
||||
while (ts)
|
||||
{
|
||||
struct tagstack_item *p = ts->next;
|
||||
xfree (ts);
|
||||
ts = p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct tagstack_item *
|
||||
tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
|
||||
const char *tagname_end)
|
||||
{
|
||||
int len = tagname_end - tagname_begin;
|
||||
while (tail)
|
||||
{
|
||||
if (len == (tail->tagname_end - tail->tagname_begin))
|
||||
{
|
||||
if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
|
||||
return tail;
|
||||
}
|
||||
tail = tail->prev;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Decode the HTML character entity at *PTR, considering END to be end
|
||||
of buffer. It is assumed that the "&" character that marks the
|
||||
beginning of the entity has been seen at *PTR-1. If a recognized
|
||||
@ -756,6 +844,9 @@ map_html_tags (const char *text, int size,
|
||||
bool attr_pair_resized = false;
|
||||
struct attr_pair *pairs = attr_pair_initial_storage;
|
||||
|
||||
struct tagstack_item *head = NULL;
|
||||
struct tagstack_item *tail = NULL;
|
||||
|
||||
if (!size)
|
||||
return;
|
||||
|
||||
@ -822,6 +913,18 @@ map_html_tags (const char *text, int size,
|
||||
goto look_for_tag;
|
||||
tag_name_end = p;
|
||||
SKIP_WS (p);
|
||||
|
||||
if (!end_tag)
|
||||
{
|
||||
struct tagstack_item *ts = tagstack_push (&head, &tail);
|
||||
if (ts)
|
||||
{
|
||||
ts->tagname_begin = tag_name_begin;
|
||||
ts->tagname_end = tag_name_end;
|
||||
ts->contents_begin = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (end_tag && *p != '>')
|
||||
goto backout_tag;
|
||||
|
||||
@ -983,6 +1086,11 @@ map_html_tags (const char *text, int size,
|
||||
++nattrs;
|
||||
}
|
||||
|
||||
if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
|
||||
{
|
||||
tail->contents_begin = p+1;
|
||||
}
|
||||
|
||||
if (uninteresting_tag)
|
||||
{
|
||||
ADVANCE (p);
|
||||
@ -994,6 +1102,7 @@ map_html_tags (const char *text, int size,
|
||||
{
|
||||
int i;
|
||||
struct taginfo taginfo;
|
||||
struct tagstack_item *ts = NULL;
|
||||
|
||||
taginfo.name = pool.contents;
|
||||
taginfo.end_tag_p = end_tag;
|
||||
@ -1010,6 +1119,23 @@ map_html_tags (const char *text, int size,
|
||||
taginfo.attrs = pairs;
|
||||
taginfo.start_position = tag_start_position;
|
||||
taginfo.end_position = p + 1;
|
||||
taginfo.contents_begin = NULL;
|
||||
taginfo.contents_end = NULL;
|
||||
|
||||
if (end_tag)
|
||||
{
|
||||
ts = tagstack_find (tail, tag_name_begin, tag_name_end);
|
||||
if (ts)
|
||||
{
|
||||
if (ts->contents_begin)
|
||||
{
|
||||
taginfo.contents_begin = ts->contents_begin;
|
||||
taginfo.contents_end = tag_start_position;
|
||||
}
|
||||
tagstack_pop (&head, &tail, ts);
|
||||
}
|
||||
}
|
||||
|
||||
mapfun (&taginfo, maparg);
|
||||
ADVANCE (p);
|
||||
}
|
||||
@ -1029,6 +1155,8 @@ map_html_tags (const char *text, int size,
|
||||
POOL_FREE (&pool);
|
||||
if (attr_pair_resized)
|
||||
xfree (pairs);
|
||||
/* pop any tag stack that's left */
|
||||
tagstack_pop (&head, &tail, head);
|
||||
}
|
||||
|
||||
#undef ADVANCE
|
||||
|
@ -51,6 +51,9 @@ struct taginfo {
|
||||
|
||||
const char *start_position; /* start position of tag */
|
||||
const char *end_position; /* end position of tag */
|
||||
|
||||
const char *contents_begin; /* delimiters of tag contents */
|
||||
const char *contents_end; /* only valid if end_tag_p */
|
||||
};
|
||||
|
||||
struct hash_table; /* forward declaration */
|
||||
|
107
src/html-url.c
107
src/html-url.c
@ -41,9 +41,9 @@ so, delete this exception statement from your version. */
|
||||
#include "utils.h"
|
||||
#include "hash.h"
|
||||
#include "convert.h"
|
||||
#include "recur.h" /* declaration of get_urls_html */
|
||||
|
||||
struct map_context;
|
||||
#include "recur.h"
|
||||
#include "html-url.h"
|
||||
#include "css-url.h"
|
||||
|
||||
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
|
||||
|
||||
@ -167,7 +167,8 @@ static const char *additional_attributes[] = {
|
||||
"http-equiv", /* used by tag_handle_meta */
|
||||
"name", /* used by tag_handle_meta */
|
||||
"content", /* used by tag_handle_meta */
|
||||
"action" /* used by tag_handle_form */
|
||||
"action", /* used by tag_handle_form */
|
||||
"style" /* used by check_style_attr */
|
||||
};
|
||||
|
||||
static struct hash_table *interesting_tags;
|
||||
@ -246,28 +247,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct map_context {
|
||||
char *text; /* HTML text. */
|
||||
char *base; /* Base URI of the document, possibly
|
||||
changed through <base href=...>. */
|
||||
const char *parent_base; /* Base of the current document. */
|
||||
const char *document_file; /* File name of this document. */
|
||||
bool nofollow; /* whether NOFOLLOW was specified in a
|
||||
<meta name=robots> tag. */
|
||||
|
||||
struct urlpos *head, *tail; /* List of URLs that is being
|
||||
built. */
|
||||
};
|
||||
/* used for calls to append_url */
|
||||
#define ATTR_POS(tag, attrind, ctx) \
|
||||
(tag->attrs[attrind].value_raw_beginning - ctx->text)
|
||||
#define ATTR_SIZE(tag, attrind) \
|
||||
(tag->attrs[attrind].value_raw_size)
|
||||
|
||||
/* Append LINK_URI to the urlpos structure that is being built.
|
||||
|
||||
LINK_URI will be merged with the current document base. TAG and
|
||||
ATTRIND are the necessary context to store the position and
|
||||
size. */
|
||||
LINK_URI will be merged with the current document base.
|
||||
*/
|
||||
|
||||
static struct urlpos *
|
||||
append_url (const char *link_uri,
|
||||
struct taginfo *tag, int attrind, struct map_context *ctx)
|
||||
struct urlpos *
|
||||
append_url (const char *link_uri, int position, int size,
|
||||
struct map_context *ctx)
|
||||
{
|
||||
int link_has_scheme = url_has_scheme (link_uri);
|
||||
struct urlpos *newel;
|
||||
@ -325,8 +318,8 @@ append_url (const char *link_uri,
|
||||
|
||||
newel = xnew0 (struct urlpos);
|
||||
newel->url = url;
|
||||
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
|
||||
newel->size = tag->attrs[attrind].value_raw_size;
|
||||
newel->pos = position;
|
||||
newel->size = size;
|
||||
|
||||
/* A URL is relative if the host is not named, and the name does not
|
||||
start with `/'. */
|
||||
@ -346,6 +339,18 @@ append_url (const char *link_uri,
|
||||
return newel;
|
||||
}
|
||||
|
||||
static void
|
||||
check_style_attr (struct taginfo *tag, struct map_context *ctx)
|
||||
{
|
||||
int attrind;
|
||||
char *style = find_attr (tag, "style", &attrind);
|
||||
if (!style)
|
||||
return;
|
||||
|
||||
/* raw pos and raw size include the quotes, hence the +1 -2 */
|
||||
get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
|
||||
}
|
||||
|
||||
/* All the tag_* functions are called from collect_tags_mapper, as
|
||||
specified by KNOWN_TAGS. */
|
||||
|
||||
@ -393,7 +398,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
if (0 == strcasecmp (tag->attrs[attrind].name,
|
||||
tag_url_attributes[i].attr_name))
|
||||
{
|
||||
struct urlpos *up = append_url (link, tag, attrind, ctx);
|
||||
struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
|
||||
ATTR_SIZE(tag,attrind), ctx);
|
||||
if (up)
|
||||
{
|
||||
int flags = tag_url_attributes[i].flags;
|
||||
@ -418,7 +424,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
if (!newbase)
|
||||
return;
|
||||
|
||||
base_urlpos = append_url (newbase, tag, attrind, ctx);
|
||||
base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
|
||||
ATTR_SIZE(tag,attrind), ctx);
|
||||
if (!base_urlpos)
|
||||
return;
|
||||
base_urlpos->ignore_when_downloading = 1;
|
||||
@ -439,9 +446,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
{
|
||||
int attrind;
|
||||
char *action = find_attr (tag, "action", &attrind);
|
||||
|
||||
if (action)
|
||||
{
|
||||
struct urlpos *up = append_url (action, tag, attrind, ctx);
|
||||
struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
|
||||
ATTR_SIZE(tag,attrind), ctx);
|
||||
if (up)
|
||||
up->ignore_when_downloading = 1;
|
||||
}
|
||||
@ -464,14 +473,23 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
*/
|
||||
if (href)
|
||||
{
|
||||
struct urlpos *up = append_url (href, tag, attrind, ctx);
|
||||
struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
|
||||
ATTR_SIZE(tag,attrind), ctx);
|
||||
if (up)
|
||||
{
|
||||
char *rel = find_attr (tag, "rel", NULL);
|
||||
if (rel
|
||||
&& (0 == strcasecmp (rel, "stylesheet")
|
||||
|| 0 == strcasecmp (rel, "shortcut icon")))
|
||||
if (rel)
|
||||
{
|
||||
if (0 == strcasecmp (rel, "stylesheet"))
|
||||
{
|
||||
up->link_inline_p = 1;
|
||||
up->link_expect_css = 1;
|
||||
}
|
||||
else if (0 == strcasecmp (rel, "shortcut icon"))
|
||||
{
|
||||
up->link_inline_p = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* The external ones usually point to HTML pages, such as
|
||||
<link rel="next" href="..."> */
|
||||
@ -525,7 +543,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
|
||||
while (ISSPACE (*p))
|
||||
++p;
|
||||
|
||||
entry = append_url (p, tag, attrind, ctx);
|
||||
entry = append_url (p, ATTR_POS(tag,attrind,ctx),
|
||||
ATTR_SIZE(tag,attrind), ctx);
|
||||
if (entry)
|
||||
{
|
||||
entry->link_refresh_p = 1;
|
||||
@ -570,11 +589,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
|
||||
struct map_context *ctx = (struct map_context *)arg;
|
||||
|
||||
/* Find the tag in our table of tags. This must not fail because
|
||||
map_html_tags only returns tags found in interesting_tags. */
|
||||
struct known_tag *t = hash_table_get (interesting_tags, tag->name);
|
||||
assert (t != NULL);
|
||||
map_html_tags only returns tags found in interesting_tags.
|
||||
|
||||
I've changed this for now, I'm passing NULL as interesting_tags
|
||||
to map_html_tags. This way we can check all tags for a style
|
||||
attribute.
|
||||
*/
|
||||
struct known_tag *t = hash_table_get (interesting_tags, tag->name);
|
||||
|
||||
if (t != NULL)
|
||||
t->handler (t->tagid, tag, ctx);
|
||||
|
||||
check_style_attr (tag, ctx);
|
||||
|
||||
if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
|
||||
tag->contents_begin && tag->contents_end)
|
||||
{
|
||||
/* parse contents */
|
||||
get_urls_css (ctx, tag->contents_begin - ctx->text,
|
||||
tag->contents_end - tag->contents_begin);
|
||||
}
|
||||
}
|
||||
|
||||
/* Analyze HTML tags FILE and construct a list of URLs referenced from
|
||||
@ -618,8 +652,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
|
||||
if (opt.strict_comments)
|
||||
flags |= MHT_STRICT_COMMENTS;
|
||||
|
||||
/* the NULL here used to be interesting_tags */
|
||||
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
|
||||
interesting_tags, interesting_attributes);
|
||||
NULL, interesting_attributes);
|
||||
|
||||
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
|
||||
if (meta_disallow_follow)
|
||||
|
51
src/html-url.h
Normal file
51
src/html-url.h
Normal file
@ -0,0 +1,51 @@
|
||||
/* Declarations for html-url.c.
|
||||
Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of GNU Wget.
|
||||
|
||||
GNU Wget is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
GNU Wget is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Wget; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
In addition, as a special exception, the Free Software Foundation
|
||||
gives permission to link the code of its release of Wget with the
|
||||
OpenSSL project's "OpenSSL" library (or with modified versions of it
|
||||
that use the same license as the "OpenSSL" library), and distribute
|
||||
the linked executables. You must obey the GNU General Public License
|
||||
in all respects for all of the code used other than "OpenSSL". If you
|
||||
modify this file, you may extend this exception to your version of the
|
||||
file, but you are not obligated to do so. If you do not wish to do
|
||||
so, delete this exception statement from your version. */
|
||||
|
||||
#ifndef HTML_URL_H
|
||||
#define HTML_URL_H
|
||||
|
||||
/* State carried through one URL-extraction pass over a document.  */
struct map_context {
  char *text;                   /* The HTML text being scanned. */
  char *base;                   /* Document base URI; a <base href=...>
                                   tag may replace it. */
  const char *parent_base;      /* Base of the current document. */
  const char *document_file;    /* Name of the file holding this
                                   document. */
  bool nofollow;                /* Set when a <meta name=robots> tag
                                   specified NOFOLLOW. */

  struct urlpos *head, *tail;   /* First and last element of the URL
                                   list under construction. */
};
|
||||
|
||||
struct urlpos *get_urls_file (const char *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *);
|
||||
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
||||
void free_urlpos (struct urlpos *);
|
||||
|
||||
#endif /* HTML_URL_H */
|
77
src/http.c
77
src/http.c
@ -77,6 +77,7 @@ static struct cookie_jar *wget_cookie_jar;
|
||||
|
||||
#define TEXTHTML_S "text/html"
|
||||
#define TEXTXHTML_S "application/xhtml+xml"
|
||||
#define TEXTCSS_S "text/css"
|
||||
|
||||
/* Some status code validation macros: */
|
||||
#define H_20X(x) (((x) >= 200) && ((x) < 300))
|
||||
@ -1235,6 +1236,7 @@ static char *create_authorization_line (const char *, const char *,
|
||||
const char *, bool *);
|
||||
static char *basic_authentication_encode (const char *, const char *);
|
||||
static bool known_authentication_scheme_p (const char *, const char *);
|
||||
static void ensure_extension (struct http_stat *, const char *, int *);
|
||||
static void load_cookies (void);
|
||||
|
||||
#define BEGINS_WITH(line, string_constant) \
|
||||
@ -2017,34 +2019,25 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
|
||||
else
|
||||
*dt &= ~TEXTHTML;
|
||||
|
||||
if (opt.html_extension && (*dt & TEXTHTML))
|
||||
/* -E / --html-extension / html_extension = on was specified, and this is a
|
||||
text/html file. If some case-insensitive variation on ".htm[l]" isn't
|
||||
already the file's suffix, tack on ".html". */
|
||||
{
|
||||
char *last_period_in_local_filename = strrchr (hs->local_file, '.');
|
||||
if (type &&
|
||||
0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
|
||||
*dt |= TEXTCSS;
|
||||
else
|
||||
*dt &= ~TEXTCSS;
|
||||
|
||||
if (last_period_in_local_filename == NULL
|
||||
|| !(0 == strcasecmp (last_period_in_local_filename, ".htm")
|
||||
|| 0 == strcasecmp (last_period_in_local_filename, ".html")))
|
||||
if (opt.html_extension)
|
||||
{
|
||||
int local_filename_len = strlen (hs->local_file);
|
||||
/* Resize the local file, allowing for ".html" preceded by
|
||||
optional ".NUMBER". */
|
||||
hs->local_file = xrealloc (hs->local_file,
|
||||
local_filename_len + 24 + sizeof (".html"));
|
||||
strcpy(hs->local_file + local_filename_len, ".html");
|
||||
/* If clobbering is not allowed and the file, as named,
|
||||
exists, tack on ".NUMBER.html" instead. */
|
||||
if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
|
||||
if (*dt & TEXTHTML)
|
||||
/* -E / --html-extension / html_extension = on was specified,
|
||||
and this is a text/html file. If some case-insensitive
|
||||
variation on ".htm[l]" isn't already the file's suffix,
|
||||
tack on ".html". */
|
||||
{
|
||||
int ext_num = 1;
|
||||
do
|
||||
sprintf (hs->local_file + local_filename_len,
|
||||
".%d.html", ext_num++);
|
||||
while (file_exists_p (hs->local_file));
|
||||
ensure_extension (hs, ".html", dt);
|
||||
}
|
||||
*dt |= ADDED_HTML_EXTENSION;
|
||||
else if (*dt & TEXTCSS)
|
||||
{
|
||||
ensure_extension (hs, ".css", dt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3018,6 +3011,42 @@ http_cleanup (void)
|
||||
cookie_jar_delete (wget_cookie_jar);
|
||||
}
|
||||
|
||||
void
|
||||
ensure_extension (struct http_stat *hs, const char *ext, int *dt)
|
||||
{
|
||||
char *last_period_in_local_filename = strrchr (hs->local_file, '.');
|
||||
char shortext[8];
|
||||
int len = strlen (ext);
|
||||
if (len == 5)
|
||||
{
|
||||
strncpy (shortext, ext, len - 1);
|
||||
shortext[len - 2] = '\0';
|
||||
}
|
||||
|
||||
if (last_period_in_local_filename == NULL
|
||||
|| !(0 == strcasecmp (last_period_in_local_filename, shortext)
|
||||
|| 0 == strcasecmp (last_period_in_local_filename, ext)))
|
||||
{
|
||||
int local_filename_len = strlen (hs->local_file);
|
||||
/* Resize the local file, allowing for ".html" preceded by
|
||||
optional ".NUMBER". */
|
||||
hs->local_file = xrealloc (hs->local_file,
|
||||
local_filename_len + 24 + len);
|
||||
strcpy (hs->local_file + local_filename_len, ext);
|
||||
/* If clobbering is not allowed and the file, as named,
|
||||
exists, tack on ".NUMBER.html" instead. */
|
||||
if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
|
||||
{
|
||||
int ext_num = 1;
|
||||
do
|
||||
sprintf (hs->local_file + local_filename_len,
|
||||
".%d%s", ext_num++, ext);
|
||||
while (file_exists_p (hs->local_file));
|
||||
}
|
||||
*dt |= ADDED_HTML_EXTENSION;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef TESTING
|
||||
|
||||
|
58
src/recur.c
58
src/recur.c
@ -48,8 +48,10 @@ so, delete this exception statement from your version. */
|
||||
#include "hash.h"
|
||||
#include "res.h"
|
||||
#include "convert.h"
|
||||
#include "html-url.h"
|
||||
#include "css-url.h"
|
||||
#include "spider.h"
|
||||
|
||||
|
||||
/* Functions for maintaining the URL queue. */
|
||||
|
||||
struct queue_element {
|
||||
@ -58,7 +60,8 @@ struct queue_element {
|
||||
int depth; /* the depth */
|
||||
bool html_allowed; /* whether the document is allowed to
|
||||
be treated as HTML. */
|
||||
|
||||
bool css_allowed; /* whether the document is allowed to
|
||||
be treated as CSS. */
|
||||
struct queue_element *next; /* next element in queue */
|
||||
};
|
||||
|
||||
@ -91,13 +94,15 @@ url_queue_delete (struct url_queue *queue)
|
||||
|
||||
static void
|
||||
url_enqueue (struct url_queue *queue,
|
||||
const char *url, const char *referer, int depth, bool html_allowed)
|
||||
const char *url, const char *referer, int depth,
|
||||
bool html_allowed, bool css_allowed)
|
||||
{
|
||||
struct queue_element *qel = xnew (struct queue_element);
|
||||
qel->url = url;
|
||||
qel->referer = referer;
|
||||
qel->depth = depth;
|
||||
qel->html_allowed = html_allowed;
|
||||
qel->css_allowed = css_allowed;
|
||||
qel->next = NULL;
|
||||
|
||||
++queue->count;
|
||||
@ -121,7 +126,7 @@ url_enqueue (struct url_queue *queue,
|
||||
static bool
|
||||
url_dequeue (struct url_queue *queue,
|
||||
const char **url, const char **referer, int *depth,
|
||||
bool *html_allowed)
|
||||
bool *html_allowed, bool *css_allowed)
|
||||
{
|
||||
struct queue_element *qel = queue->head;
|
||||
|
||||
@ -136,6 +141,7 @@ url_dequeue (struct url_queue *queue,
|
||||
*referer = qel->referer;
|
||||
*depth = qel->depth;
|
||||
*html_allowed = qel->html_allowed;
|
||||
*css_allowed = qel->css_allowed;
|
||||
|
||||
--queue->count;
|
||||
|
||||
@ -200,7 +206,7 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
/* Enqueue the starting URL. Use start_url_parsed->url rather than
|
||||
just URL so we enqueue the canonical form of the URL. */
|
||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
|
||||
url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
|
||||
string_set_add (blacklist, start_url_parsed->url);
|
||||
|
||||
while (1)
|
||||
@ -208,7 +214,8 @@ retrieve_tree (const char *start_url)
|
||||
bool descend = false;
|
||||
char *url, *referer, *file = NULL;
|
||||
int depth;
|
||||
bool html_allowed;
|
||||
bool html_allowed, css_allowed;
|
||||
bool is_css = false;
|
||||
bool dash_p_leaf_HTML = false;
|
||||
|
||||
if (opt.quota && total_downloaded_bytes > opt.quota)
|
||||
@ -220,7 +227,7 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
if (!url_dequeue (queue,
|
||||
(const char **)&url, (const char **)&referer,
|
||||
&depth, &html_allowed))
|
||||
&depth, &html_allowed, &css_allowed))
|
||||
break;
|
||||
|
||||
/* ...and download it. Note that this download is in most cases
|
||||
@ -238,10 +245,21 @@ retrieve_tree (const char *start_url)
|
||||
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
|
||||
url, file));
|
||||
|
||||
/* this sucks, needs to be combined! */
|
||||
if (html_allowed
|
||||
&& downloaded_html_set
|
||||
&& string_set_contains (downloaded_html_set, file))
|
||||
{
|
||||
descend = true;
|
||||
is_css = false;
|
||||
}
|
||||
if (css_allowed
|
||||
&& downloaded_css_set
|
||||
&& string_set_contains (downloaded_css_set, file))
|
||||
{
|
||||
descend = 1;
|
||||
is_css = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -252,7 +270,21 @@ retrieve_tree (const char *start_url)
|
||||
|
||||
if (html_allowed && file && status == RETROK
|
||||
&& (dt & RETROKF) && (dt & TEXTHTML))
|
||||
{
|
||||
descend = true;
|
||||
is_css = false;
|
||||
}
|
||||
|
||||
/* a little different, css_allowed can override content type
|
||||
lots of web servers serve css with an incorrect content type
|
||||
*/
|
||||
if (file && status == RETROK
|
||||
&& (dt & RETROKF) &&
|
||||
((dt & TEXTCSS) || css_allowed))
|
||||
{
|
||||
descend = true;
|
||||
is_css = false;
|
||||
}
|
||||
|
||||
if (redirected)
|
||||
{
|
||||
@ -306,14 +338,15 @@ retrieve_tree (const char *start_url)
|
||||
}
|
||||
}
|
||||
|
||||
/* If the downloaded document was HTML, parse it and enqueue the
|
||||
/* If the downloaded document was HTML or CSS, parse it and enqueue the
|
||||
links it contains. */
|
||||
|
||||
if (descend)
|
||||
{
|
||||
bool meta_disallow_follow = false;
|
||||
struct urlpos *children
|
||||
= get_urls_html (file, url, &meta_disallow_follow);
|
||||
= is_css ? get_urls_css_file (file, url) :
|
||||
get_urls_html (file, url, &meta_disallow_follow);
|
||||
|
||||
if (opt.use_robots && meta_disallow_follow)
|
||||
{
|
||||
@ -338,7 +371,8 @@ retrieve_tree (const char *start_url)
|
||||
{
|
||||
url_enqueue (queue, xstrdup (child->url->url),
|
||||
xstrdup (url), depth + 1,
|
||||
child->link_expect_html);
|
||||
child->link_expect_html,
|
||||
child->link_expect_css);
|
||||
/* We blacklist the URL we have enqueued, because we
|
||||
don't want to enqueue (and hence download) the
|
||||
same URL twice. */
|
||||
@ -385,9 +419,9 @@ retrieve_tree (const char *start_url)
|
||||
{
|
||||
char *d1, *d2;
|
||||
int d3;
|
||||
bool d4;
|
||||
bool d4, d5;
|
||||
while (url_dequeue (queue,
|
||||
(const char **)&d1, (const char **)&d2, &d3, &d4))
|
||||
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
|
||||
{
|
||||
xfree (d1);
|
||||
xfree_null (d2);
|
||||
|
@ -43,9 +43,4 @@ struct urlpos;
|
||||
void recursive_cleanup (void);
|
||||
uerr_t retrieve_tree (const char *);
|
||||
|
||||
/* These are really in html-url.c. */
|
||||
struct urlpos *get_urls_file (const char *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *);
|
||||
void free_urlpos (struct urlpos *);
|
||||
|
||||
#endif /* RECUR_H */
|
||||
|
@ -51,6 +51,7 @@ so, delete this exception statement from your version. */
|
||||
#include "hash.h"
|
||||
#include "convert.h"
|
||||
#include "ptimer.h"
|
||||
#include "html-url.h"
|
||||
|
||||
/* Total size of downloaded files. Used to enforce quota. */
|
||||
SUM_SIZE_INT total_downloaded_bytes;
|
||||
@ -784,6 +785,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
|
||||
register_redirection (origurl, u->url);
|
||||
if (*dt & TEXTHTML)
|
||||
register_html (u->url, local_file);
|
||||
if (*dt & TEXTCSS)
|
||||
register_css (u->url, local_file);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -304,7 +304,8 @@ enum
|
||||
HEAD_ONLY = 0x0004, /* only send the HEAD request */
|
||||
SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */
|
||||
ACCEPTRANGES = 0x0010, /* Accept-ranges header was found */
|
||||
ADDED_HTML_EXTENSION = 0x0020 /* added ".html" extension due to -E */
|
||||
ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */
|
||||
TEXTCSS = 0x0040 /* document is of type text/css */
|
||||
};
|
||||
|
||||
/* Universal error type -- used almost everywhere. Error reporting of
|
||||
|
Loading…
Reference in New Issue
Block a user