mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Add support for -accept-regex and --reject-regex.
This commit is contained in:
parent
0aa3c5d33c
commit
f5a1097871
@ -1,3 +1,8 @@
|
||||
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
|
||||
|
||||
* bootstrap.conf (gnulib_modules): Include module `regex'.
|
||||
* configure.ac: Check for PCRE library.
|
||||
|
||||
2012-03-25 Ray Satiro <raysatiro@yahoo.com>
|
||||
|
||||
* configure.ac: Fix build under mingw when OpenSSL is used.
|
||||
|
@ -58,6 +58,7 @@ pipe
|
||||
quote
|
||||
quotearg
|
||||
recv
|
||||
regex
|
||||
select
|
||||
send
|
||||
setsockopt
|
||||
|
12
configure.ac
12
configure.ac
@ -532,6 +532,18 @@ AC_CHECK_HEADER(uuid/uuid.h,
|
||||
])
|
||||
)
|
||||
|
||||
dnl
|
||||
dnl Check for PCRE
|
||||
dnl
|
||||
|
||||
AC_CHECK_HEADER(pcre.h,
|
||||
AC_CHECK_LIB(pcre, pcre_compile,
|
||||
[LIBS="${LIBS} -lpcre"
|
||||
AC_DEFINE([HAVE_LIBPCRE], 1,
|
||||
[Define if libpcre is available.])
|
||||
])
|
||||
)
|
||||
|
||||
|
||||
dnl Needed by src/Makefile.am
|
||||
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
||||
|
@ -1,3 +1,12 @@
|
||||
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
|
||||
|
||||
* init.c: Add --accept-regex, --reject-regex and --regex-type.
|
||||
* main.c: Likewise.
|
||||
* options.c: Likewise.
|
||||
* recur.c: Likewise.
|
||||
* utils.c: Add regex-related functions.
|
||||
* utils.h: Add regex-related functions.
|
||||
|
||||
2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de>
|
||||
|
||||
* convert.c (convert_links_in_hashtable): Mmake it static.
|
||||
|
29
src/init.c
29
src/init.c
@ -46,6 +46,10 @@ as that of the covered work. */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <regex.h>
|
||||
#ifdef HAVE_LIBPCRE
|
||||
# include <pcre.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_PWD_H
|
||||
# include <pwd.h>
|
||||
@ -94,6 +98,7 @@ CMD_DECLARE (cmd_spec_mirror);
|
||||
CMD_DECLARE (cmd_spec_prefer_family);
|
||||
CMD_DECLARE (cmd_spec_progress);
|
||||
CMD_DECLARE (cmd_spec_recursive);
|
||||
CMD_DECLARE (cmd_spec_regex_type);
|
||||
CMD_DECLARE (cmd_spec_restrict_file_names);
|
||||
#ifdef HAVE_SSL
|
||||
CMD_DECLARE (cmd_spec_secure_protocol);
|
||||
@ -116,6 +121,7 @@ static const struct {
|
||||
} commands[] = {
|
||||
/* KEEP THIS LIST ALPHABETICALLY SORTED */
|
||||
{ "accept", &opt.accepts, cmd_vector },
|
||||
{ "acceptregex", &opt.acceptregex_s, cmd_string },
|
||||
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
|
||||
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
|
||||
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
|
||||
@ -236,7 +242,9 @@ static const struct {
|
||||
{ "reclevel", &opt.reclevel, cmd_number_inf },
|
||||
{ "recursive", NULL, cmd_spec_recursive },
|
||||
{ "referer", &opt.referer, cmd_string },
|
||||
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
|
||||
{ "reject", &opt.rejects, cmd_vector },
|
||||
{ "rejectregex", &opt.rejectregex_s, cmd_string },
|
||||
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
||||
{ "remoteencoding", &opt.encoding_remote, cmd_string },
|
||||
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
||||
@ -361,6 +369,8 @@ defaults (void)
|
||||
opt.restrict_files_nonascii = false;
|
||||
opt.restrict_files_case = restrict_no_case_restriction;
|
||||
|
||||
opt.regex_type = regex_type_posix;
|
||||
|
||||
opt.max_redirect = 20;
|
||||
|
||||
opt.waitretry = 10;
|
||||
@ -1368,6 +1378,25 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Validate --regex-type and set the choice. */
|
||||
|
||||
static bool
|
||||
cmd_spec_regex_type (const char *com, const char *val, void *place_ignored)
|
||||
{
|
||||
static const struct decode_item choices[] = {
|
||||
{ "posix", regex_type_posix },
|
||||
#ifdef HAVE_LIBPCRE
|
||||
{ "pcre", regex_type_pcre },
|
||||
#endif
|
||||
};
|
||||
int regex_type = regex_type_posix;
|
||||
int ok = decode_string (val, choices, countof (choices), ®ex_type);
|
||||
if (!ok)
|
||||
fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
|
||||
opt.regex_type = regex_type;
|
||||
return ok;
|
||||
}
|
||||
|
||||
static bool
|
||||
cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
|
||||
{
|
||||
|
43
src/main.c
43
src/main.c
@ -158,6 +158,7 @@ struct cmdline_option {
|
||||
static struct cmdline_option option_data[] =
|
||||
{
|
||||
{ "accept", 'A', OPT_VALUE, "accept", -1 },
|
||||
{ "accept-regex", 0, OPT_VALUE, "acceptregex", -1 },
|
||||
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
|
||||
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
|
||||
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
|
||||
@ -262,7 +263,9 @@ static struct cmdline_option option_data[] =
|
||||
{ "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
|
||||
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
|
||||
{ "referer", 0, OPT_VALUE, "referer", -1 },
|
||||
{ "regex-type", 0, OPT_VALUE, "regextype", -1 },
|
||||
{ "reject", 'R', OPT_VALUE, "reject", -1 },
|
||||
{ "reject-regex", 0, OPT_VALUE, "rejectregex", -1 },
|
||||
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
|
||||
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
|
||||
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
|
||||
@ -722,6 +725,17 @@ Recursive accept/reject:\n"),
|
||||
-A, --accept=LIST comma-separated list of accepted extensions.\n"),
|
||||
N_("\
|
||||
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
|
||||
N_("\
|
||||
--accept-regex=REGEX regex matching accepted URLs.\n"),
|
||||
N_("\
|
||||
--reject-regex=REGEX regex matching rejected URLs.\n"),
|
||||
#ifdef HAVE_LIBPCRE
|
||||
N_("\
|
||||
--regex-type=TYPE regex type (posix|pcre).\n"),
|
||||
#else
|
||||
N_("\
|
||||
--regex-type=TYPE regex type (posix).\n"),
|
||||
#endif
|
||||
N_("\
|
||||
-D, --domains=LIST comma-separated list of accepted domains.\n"),
|
||||
N_("\
|
||||
@ -1323,6 +1337,35 @@ for details.\n\n"));
|
||||
exit (1);
|
||||
}
|
||||
|
||||
/* Compile the regular expressions. */
|
||||
switch (opt.regex_type)
|
||||
{
|
||||
#ifdef HAVE_LIBPCRE
|
||||
case regex_type_pcre:
|
||||
opt.regex_compile_fun = compile_pcre_regex;
|
||||
opt.regex_match_fun = match_pcre_regex;
|
||||
break;
|
||||
#endif
|
||||
|
||||
case regex_type_posix:
|
||||
default:
|
||||
opt.regex_compile_fun = compile_posix_regex;
|
||||
opt.regex_match_fun = match_posix_regex;
|
||||
break;
|
||||
}
|
||||
if (opt.acceptregex_s)
|
||||
{
|
||||
opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
|
||||
if (!opt.acceptregex)
|
||||
exit (1);
|
||||
}
|
||||
if (opt.rejectregex_s)
|
||||
{
|
||||
opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
|
||||
if (!opt.rejectregex)
|
||||
exit (1);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
if (opt.enable_iri)
|
||||
{
|
||||
|
@ -74,6 +74,19 @@ struct options
|
||||
bool ignore_case; /* Whether to ignore case when
|
||||
matching dirs and files */
|
||||
|
||||
char *acceptregex_s; /* Patterns to accept (a regex string). */
|
||||
char *rejectregex_s; /* Patterns to reject (a regex string). */
|
||||
void *acceptregex; /* Patterns to accept (a regex struct). */
|
||||
void *rejectregex; /* Patterns to reject (a regex struct). */
|
||||
enum {
|
||||
#ifdef HAVE_LIBPCRE
|
||||
regex_type_pcre,
|
||||
#endif
|
||||
regex_type_posix
|
||||
} regex_type; /* The regex library. */
|
||||
void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */
|
||||
bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */
|
||||
|
||||
char **domains; /* See host.c */
|
||||
char **exclude_domains;
|
||||
bool dns_cache; /* whether we cache DNS lookups. */
|
||||
|
@ -586,6 +586,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (!accept_url (url))
|
||||
{
|
||||
DEBUGP (("%s is excluded/not-included through regex.\n", url));
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* 6. Check for acceptance/rejection rules. We ignore these rules
|
||||
for directories (no file name to match) and for non-leaf HTMLs,
|
||||
|
101
src/utils.c
101
src/utils.c
@ -73,6 +73,11 @@ as that of the covered work. */
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
#include <regex.h>
|
||||
#ifdef HAVE_LIBPCRE
|
||||
# include <pcre.h>
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_SIGSETJMP
|
||||
/* If sigsetjmp is a macro, configure won't pick it up. */
|
||||
# ifdef sigsetjmp
|
||||
@ -917,6 +922,19 @@ acceptable (const char *s)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Determine whether an URL is acceptable to be followed, according to
|
||||
regex patterns to accept/reject. */
|
||||
bool
|
||||
accept_url (const char *s)
|
||||
{
|
||||
if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s))
|
||||
return false;
|
||||
if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
|
||||
will return true if and only if D2 begins with `/something/' or is exactly
|
||||
'/something'. */
|
||||
@ -2309,6 +2327,89 @@ base64_decode (const char *base64, void *dest)
|
||||
return q - (char *) dest;
|
||||
}
|
||||
|
||||
#ifdef HAVE_LIBPCRE
|
||||
/* Compiles the PCRE regex. */
|
||||
void *
|
||||
compile_pcre_regex (const char *str)
|
||||
{
|
||||
const char *errbuf;
|
||||
int erroffset;
|
||||
pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0);
|
||||
if (! regex)
|
||||
{
|
||||
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
|
||||
quote (str), errbuf);
|
||||
return false;
|
||||
}
|
||||
return regex;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Compiles the POSIX regex. */
|
||||
void *
|
||||
compile_posix_regex (const char *str)
|
||||
{
|
||||
regex_t *regex = xmalloc (sizeof (regex_t));
|
||||
int errcode = regcomp ((regex_t *) regex, str, REG_EXTENDED | REG_NOSUB);
|
||||
if (errcode != 0)
|
||||
{
|
||||
int errbuf_size = regerror (errcode, (regex_t *) regex, NULL, 0);
|
||||
char *errbuf = xmalloc (errbuf_size);
|
||||
errbuf_size = regerror (errcode, (regex_t *) regex, errbuf, errbuf_size);
|
||||
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
|
||||
quote (str), errbuf);
|
||||
xfree (errbuf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return regex;
|
||||
}
|
||||
|
||||
#ifdef HAVE_LIBPCRE
|
||||
#define OVECCOUNT 30
|
||||
/* Matches a PCRE regex. */
|
||||
bool
|
||||
match_pcre_regex (const void *regex, const char *str)
|
||||
{
|
||||
int l = strlen (str);
|
||||
int ovector[OVECCOUNT];
|
||||
|
||||
int rc = pcre_exec ((pcre *) regex, 0, str, l, 0, 0, ovector, OVECCOUNT);
|
||||
if (rc == PCRE_ERROR_NOMATCH)
|
||||
return false;
|
||||
else if (rc < 0)
|
||||
{
|
||||
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
|
||||
quote (str), rc);
|
||||
return false;
|
||||
}
|
||||
else
|
||||
return true;
|
||||
}
|
||||
#undef OVECCOUNT
|
||||
#endif
|
||||
|
||||
/* Matches a POSIX regex. */
|
||||
bool
|
||||
match_posix_regex (const void *regex, const char *str)
|
||||
{
|
||||
int rc = regexec ((regex_t *) regex, str, 0, NULL, 0);
|
||||
if (rc == REG_NOMATCH)
|
||||
return false;
|
||||
else if (rc == 0)
|
||||
return true;
|
||||
else
|
||||
{
|
||||
int errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
|
||||
char *errbuf = xmalloc (errbuf_size);
|
||||
errbuf_size = regerror (rc, opt.acceptregex, errbuf, errbuf_size);
|
||||
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
|
||||
quote (str), rc);
|
||||
xfree (errbuf);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef IS_ASCII
|
||||
#undef NEXT_CHAR
|
||||
|
||||
|
@ -90,6 +90,7 @@ char *file_merge (const char *, const char *);
|
||||
|
||||
int fnmatch_nocase (const char *, const char *, int);
|
||||
bool acceptable (const char *);
|
||||
bool accept_url (const char *);
|
||||
bool accdir (const char *s);
|
||||
char *suffix (const char *s);
|
||||
bool match_tail (const char *, const char *, bool);
|
||||
@ -142,6 +143,14 @@ void xsleep (double);
|
||||
int base64_encode (const void *, int, char *);
|
||||
int base64_decode (const char *, void *);
|
||||
|
||||
#ifdef HAVE_LIBPCRE
|
||||
void *compile_pcre_regex (const char *);
|
||||
bool match_pcre_regex (const void *, const char *);
|
||||
#endif
|
||||
|
||||
void *compile_posix_regex (const char *);
|
||||
bool match_posix_regex (const void *, const char *);
|
||||
|
||||
void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
|
||||
|
||||
const char *print_decimal (double);
|
||||
|
Loading…
Reference in New Issue
Block a user